# Copyright (c) 2017 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import json
import os
import shutil
import uuid

from nose import SkipTest
import six

from swift.common import direct_client, utils
from swift.common.manager import Manager
from swift.common.memcached import MemcacheRing
from swift.common.direct_client import DirectClientException
from swift.common.utils import ShardRange, parse_db_filename, get_db_files, \
    quorum_size, config_true_value, Timestamp
from swift.container.backend import ContainerBroker, UNSHARDED, SHARDING
from swiftclient import client, get_auth, ClientException

from swift.proxy.controllers.base import get_cache_key
from swift.proxy.controllers.obj import num_container_updates

from test import annotate_failure
from test.probe.brain import BrainSplitter
from test.probe.common import ReplProbeTest, get_server_number, \
    wait_for_server_to_hangup
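
# The probe tests in this module require the cluster to be configured with a
# very small shard_container_threshold; these bounds are enforced by
# BaseTestContainerSharding._maybe_skip_test() below.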
MIN_SHARD_CONTAINER_THRESHOLD = 4
MAX_SHARD_CONTAINER_THRESHOLD = 100


class ShardCollector(object):
    """
    Returns map of node to tuples of (headers, shard ranges) returned from
    node
    """
    def __init__(self):
        self.ranges = {}

    def __call__(self, cnode, cpart, account, container):
        self.ranges[cnode['id']] = direct_client.direct_get_container(
            cnode, cpart, account, container,
            headers={'X-Backend-Record-Type': 'shard'})


class BaseTestContainerSharding(ReplProbeTest):

    def _maybe_skip_test(self):
        try:
            cont_configs = [utils.readconf(p, 'container-sharder')
                            for p in self.configs['container-server'].values()]
        except ValueError:
            raise SkipTest('No [container-sharder] section found in '
                           'container-server configs')

        skip_reasons = []
        auto_shard = all(config_true_value(c.get('auto_shard', False))
                         for c in cont_configs)
        if not auto_shard:
            skip_reasons.append(
                'auto_shard must be true in all container_sharder configs')

        self.max_shard_size = max(
            int(c.get('shard_container_threshold', '1000000'))
            for c in cont_configs)

        if not (MIN_SHARD_CONTAINER_THRESHOLD <= self.max_shard_size
                <= MAX_SHARD_CONTAINER_THRESHOLD):
            skip_reasons.append(
                'shard_container_threshold %d must be between %d and %d' %
                (self.max_shard_size, MIN_SHARD_CONTAINER_THRESHOLD,
                 MAX_SHARD_CONTAINER_THRESHOLD))

        def skip_check(reason_list, option, required):
            values = {int(c.get(option, required)) for c in cont_configs}
            if values != {required}:
                reason_list.append('%s must be %s' % (option, required))

        skip_check(skip_reasons, 'shard_scanner_batch_size', 10)
        skip_check(skip_reasons, 'shard_batch_size', 2)

        if skip_reasons:
            raise SkipTest(', '.join(skip_reasons))

    def _load_rings_and_configs(self):
        super(BaseTestContainerSharding, self)._load_rings_and_configs()
        # perform checks for skipping test before starting services
        self._maybe_skip_test()

    def _make_object_names(self, number):
        return ['obj-%04d' % x for x in range(number)]

    def _setup_container_name(self):
        self.container_name = 'container-%s' % uuid.uuid4()

    def setUp(self):
        client.logger.setLevel(client.logging.WARNING)
        client.requests.logging.getLogger().setLevel(
            client.requests.logging.WARNING)
        super(BaseTestContainerSharding, self).setUp()
        _, self.admin_token = get_auth(
            'http://127.0.0.1:8080/auth/v1.0', 'admin:admin', 'admin')
        self._setup_container_name()
        self.brain = BrainSplitter(self.url, self.token, self.container_name,
                                   None, 'container')
        self.brain.put_container(policy_index=int(self.policy))
        self.sharders = Manager(['container-sharder'])
        self.internal_client = self.make_internal_client()
        self.memcache = MemcacheRing(['127.0.0.1:11211'])
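
    # Note: 'node_numbers', when given, is expected to be a slice into
    # brain.node_numbers selecting which container servers to stop; with no
    # argument, all container servers are stopped.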
    def stop_container_servers(self, node_numbers=None):
        if node_numbers:
            ipports = []
            server2ipport = {v: k for k, v in self.ipport2server.items()}
            for number in self.brain.node_numbers[node_numbers]:
                self.brain.servers.stop(number=number)
                server = 'container%d' % number
                ipports.append(server2ipport[server])
        else:
            ipports = [k for k, v in self.ipport2server.items()
                       if v.startswith('container')]
            self.brain.servers.stop()
        for ipport in ipports:
            wait_for_server_to_hangup(ipport)

    def put_objects(self, obj_names, contents=None):
        for obj in obj_names:
            client.put_object(self.url, token=self.token,
                              container=self.container_name, name=obj,
                              contents=contents)

    def delete_objects(self, obj_names):
        for obj in obj_names:
            client.delete_object(
                self.url, self.token, self.container_name, obj)

    def get_container_shard_ranges(self, account=None, container=None):
        account = account if account else self.account
        container = container if container else self.container_name
        path = self.internal_client.make_path(account, container)
        resp = self.internal_client.make_request(
            'GET', path + '?format=json', {'X-Backend-Record-Type': 'shard'},
            [200])
        return [ShardRange.from_dict(sr) for sr in json.loads(resp.body)]
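
    # Apply 'func' to the container on each of its primary container nodes,
    # collecting per-node results; the test fails if any node responds
    # unexpectedly (an error when success is expected, or vice versa).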
    def direct_container_op(self, func, account=None, container=None,
                            expect_failure=False):
        account = account if account else self.account
        container = container if container else self.container_name
        cpart, cnodes = self.container_ring.get_nodes(account, container)
        unexpected_responses = []
        results = {}
        for cnode in cnodes:
            try:
                results[cnode['id']] = func(cnode, cpart, account, container)
            except DirectClientException as err:
                if not expect_failure:
                    unexpected_responses.append((cnode, err))
            else:
                if expect_failure:
                    unexpected_responses.append((cnode, 'success'))
        if unexpected_responses:
            self.fail('Unexpected responses: %s' % unexpected_responses)
        return results

    def direct_get_container_shard_ranges(self, account=None, container=None,
                                          expect_failure=False):
        collector = ShardCollector()
        self.direct_container_op(
            collector, account, container, expect_failure)
        return collector.ranges

    def direct_delete_container(self, account=None, container=None,
                                expect_failure=False):
        self.direct_container_op(direct_client.direct_delete_container,
                                 account, container, expect_failure)

    def direct_head_container(self, account=None, container=None,
                              expect_failure=False):
        return self.direct_container_op(direct_client.direct_head_container,
                                        account, container, expect_failure)

    def direct_get_container(self, account=None, container=None,
                             expect_failure=False):
        return self.direct_container_op(direct_client.direct_get_container,
                                        account, container, expect_failure)

    def get_storage_dir(self, part, node, account=None, container=None):
        account = account or self.brain.account
        container = container or self.container_name
        server_type, config_number = get_server_number(
            (node['ip'], node['port']), self.ipport2server)
        assert server_type == 'container'
        repl_server = '%s-replicator' % server_type
        conf = utils.readconf(self.configs[repl_server][config_number],
                              section_name=repl_server)
        datadir = os.path.join(conf['devices'], node['device'], 'containers')
        container_hash = utils.hash_path(account, container)
        return (utils.storage_directory(datadir, part, container_hash),
                container_hash)

    def get_broker(self, part, node, account=None, container=None):
        container_dir, container_hash = self.get_storage_dir(
            part, node, account=account, container=container)
        db_file = os.path.join(container_dir, container_hash + '.db')
        self.assertTrue(get_db_files(db_file))  # sanity check
        return ContainerBroker(db_file)
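
    # Inspect each replica's on-disk container directory and bucket the files
    # found there: db filenames that parse with an epoch are 'shard_dbs'
    # (created by the sharder); plain db filenames are 'normal_dbs'.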
    def categorize_container_dir_content(self, account=None, container=None):
        account = account or self.brain.account
        container = container or self.container_name
        part, nodes = self.brain.ring.get_nodes(account, container)
        storage_dirs = [
            self.get_storage_dir(part, node, account=account,
                                 container=container)[0]
            for node in nodes]
        result = {
            'shard_dbs': [],
            'normal_dbs': [],
            'pendings': [],
            'locks': [],
            'other': [],
        }
        for storage_dir in storage_dirs:
            for f in os.listdir(storage_dir):
                path = os.path.join(storage_dir, f)
                if path.endswith('.db'):
                    hash_, epoch, ext = parse_db_filename(path)
                    if epoch:
                        result['shard_dbs'].append(path)
                    else:
                        result['normal_dbs'].append(path)
                elif path.endswith('.db.pending'):
                    result['pendings'].append(path)
                elif path.endswith('/.lock'):
                    result['locks'].append(path)
                else:
                    result['other'].append(path)
        if result['other']:
            self.fail('Found unexpected files in storage directory:\n  %s' %
                      '\n  '.join(result['other']))
        return result

    def assertLengthEqual(self, obj, length):
        obj_len = len(obj)
        self.assertEqual(obj_len, length, 'len(%r) == %d, not %d' % (
            obj, obj_len, length))

    def assert_dict_contains(self, expected_items, actual_dict):
        ignored = set(expected_items) ^ set(actual_dict)
        filtered_actual = {k: actual_dict[k]
                           for k in actual_dict if k not in ignored}
        self.assertEqual(expected_items, filtered_actual)

    def assert_shard_ranges_contiguous(self, expected_number, shard_ranges,
                                       first_lower='', last_upper=''):
        if shard_ranges and isinstance(shard_ranges[0], ShardRange):
            actual_shard_ranges = sorted(shard_ranges)
        else:
            actual_shard_ranges = sorted(ShardRange.from_dict(d)
                                         for d in shard_ranges)
        self.assertLengthEqual(actual_shard_ranges, expected_number)
        if expected_number:
            with annotate_failure('Ranges %s.' % actual_shard_ranges):
                self.assertEqual(first_lower, actual_shard_ranges[0].lower_str)
                for x, y in zip(actual_shard_ranges, actual_shard_ranges[1:]):
                    self.assertEqual(x.upper, y.lower)
                self.assertEqual(last_upper, actual_shard_ranges[-1].upper_str)

    def assert_shard_range_equal(self, expected, actual, excludes=None):
        excludes = excludes or []
        expected_dict = dict(expected)
        actual_dict = dict(actual)
        for k in excludes:
            expected_dict.pop(k, None)
            actual_dict.pop(k, None)
        self.assertEqual(expected_dict, actual_dict)

    def assert_shard_range_lists_equal(self, expected, actual, excludes=None):
        self.assertEqual(len(expected), len(actual))
        for expected, actual in zip(expected, actual):
            self.assert_shard_range_equal(expected, actual, excludes=excludes)

    def assert_shard_range_state(self, expected_state, shard_ranges):
        if shard_ranges and not isinstance(shard_ranges[0], ShardRange):
            shard_ranges = [ShardRange.from_dict(data)
                            for data in shard_ranges]
        self.assertEqual([expected_state] * len(shard_ranges),
                         [sr.state for sr in shard_ranges])

    def assert_total_object_count(self, expected_object_count, shard_ranges):
        actual = sum(sr['object_count'] for sr in shard_ranges)
        self.assertEqual(expected_object_count, actual)

    def assert_container_listing(self, expected_listing):
        headers, actual_listing = client.get_container(
            self.url, self.token, self.container_name)
        self.assertIn('x-container-object-count', headers)
        expected_obj_count = len(expected_listing)
        self.assertEqual(expected_listing, [
            x['name'].encode('utf-8') if six.PY2 else x['name']
            for x in actual_listing])
        self.assertEqual(str(expected_obj_count),
                         headers['x-container-object-count'])
        return headers, actual_listing

    def assert_container_object_count(self, expected_obj_count):
        headers = client.head_container(
            self.url, self.token, self.container_name)
        self.assertIn('x-container-object-count', headers)
        self.assertEqual(str(expected_obj_count),
                         headers['x-container-object-count'])

    def assert_container_post_ok(self, meta_value):
        key = 'X-Container-Meta-Assert-Post-Works'
        headers = {key: meta_value}
        client.post_container(
            self.url, self.token, self.container_name, headers=headers)
        resp_headers = client.head_container(
            self.url, self.token, self.container_name)
        self.assertEqual(meta_value, resp_headers.get(key.lower()))

    def assert_container_post_fails(self, meta_value):
        key = 'X-Container-Meta-Assert-Post-Works'
        headers = {key: meta_value}
        with self.assertRaises(ClientException) as cm:
            client.post_container(
                self.url, self.token, self.container_name, headers=headers)
        self.assertEqual(404, cm.exception.http_status)

    def assert_container_delete_fails(self):
        with self.assertRaises(ClientException) as cm:
            client.delete_container(self.url, self.token, self.container_name)
        self.assertEqual(409, cm.exception.http_status)

    def assert_container_not_found(self):
        with self.assertRaises(ClientException) as cm:
            client.get_container(self.url, self.token, self.container_name)
        self.assertEqual(404, cm.exception.http_status)
        # check for headers leaking out while deleted
        resp_headers = cm.exception.http_response_headers
        self.assertNotIn('X-Container-Object-Count', resp_headers)
        self.assertNotIn('X-Container-Bytes-Used', resp_headers)
        self.assertNotIn('X-Timestamp', resp_headers)
        self.assertNotIn('X-PUT-Timestamp', resp_headers)

    def assert_container_has_shard_sysmeta(self):
        node_headers = self.direct_head_container()
        for node_id, headers in node_headers.items():
            with annotate_failure('%s in %s' % (node_id, node_headers.keys())):
                for k, v in headers.items():
                    if k.lower().startswith('x-container-sysmeta-shard'):
                        break
                else:
                    self.fail('No shard sysmeta found in %s' % headers)

    def assert_container_state(self, node, expected_state, num_shard_ranges):
        headers, shard_ranges = direct_client.direct_get_container(
            node, self.brain.part, self.account, self.container_name,
            headers={'X-Backend-Record-Type': 'shard'})
        self.assertEqual(num_shard_ranges, len(shard_ranges))
        self.assertIn('X-Backend-Sharding-State', headers)
        self.assertEqual(
            expected_state, headers['X-Backend-Sharding-State'])
        return [ShardRange.from_dict(sr) for sr in shard_ranges]
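
    # Note: the 'number' argument accepted by Manager.once() is a 1-based
    # server config index, hence the conversion below from 0-based ring node
    # ids via n['id'] + 1.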
    def get_part_and_node_numbers(self, shard_range):
        """Return the partition and node numbers for a shard range."""
        part, nodes = self.brain.ring.get_nodes(
            shard_range.account, shard_range.container)
        return part, [n['id'] + 1 for n in nodes]

    def run_sharders(self, shard_ranges):
        """Run the sharder on partitions for given shard ranges."""
        if not isinstance(shard_ranges, (list, tuple, set)):
            shard_ranges = (shard_ranges,)
        partitions = ','.join(str(self.get_part_and_node_numbers(sr)[0])
                              for sr in shard_ranges)
        self.sharders.once(additional_args='--partitions=%s' % partitions)

    def run_sharder_sequentially(self, shard_range=None):
        """Run sharder node by node on partition for given shard range."""
        if shard_range:
            part, node_numbers = self.get_part_and_node_numbers(shard_range)
        else:
            part, node_numbers = self.brain.part, self.brain.node_numbers
        for node_number in node_numbers:
            self.sharders.once(number=node_number,
                               additional_args='--partitions=%s' % part)


class TestContainerShardingNonUTF8(BaseTestContainerSharding):
    def test_sharding_listing(self):
        # verify parameterised listing of a container during sharding
        all_obj_names = self._make_object_names(4 * self.max_shard_size)
        obj_names = all_obj_names[::2]
        self.put_objects(obj_names)
        # choose some names approx in middle of each expected shard range
        markers = [
            obj_names[i] for i in range(self.max_shard_size // 4,
                                        2 * self.max_shard_size,
                                        self.max_shard_size // 2)]
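        # (obj_names holds 2 * max_shard_size names, which should split into
        # 4 shard ranges of max_shard_size // 2 objects each; the slice above
        # picks one name near the middle of each expected range)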

        def check_listing(objects, **params):
            qs = '&'.join(['%s=%s' % param for param in params.items()])
            headers, listing = client.get_container(
                self.url, self.token, self.container_name, query_string=qs)
            listing = [x['name'].encode('utf-8') if six.PY2 else x['name']
                       for x in listing]
            if params.get('reverse'):
                marker = params.get('marker', ShardRange.MAX)
                end_marker = params.get('end_marker', ShardRange.MIN)
                expected = [o for o in objects if end_marker < o < marker]
                expected.reverse()
            else:
                marker = params.get('marker', ShardRange.MIN)
                end_marker = params.get('end_marker', ShardRange.MAX)
                expected = [o for o in objects if marker < o < end_marker]
            if 'limit' in params:
                expected = expected[:params['limit']]
            self.assertEqual(expected, listing)

        def check_listing_precondition_fails(**params):
            qs = '&'.join(['%s=%s' % param for param in params.items()])
            with self.assertRaises(ClientException) as cm:
                client.get_container(
                    self.url, self.token, self.container_name,
                    query_string=qs)
            self.assertEqual(412, cm.exception.http_status)
            return cm.exception

        def do_listing_checks(objects):
            check_listing(objects)
            check_listing(objects, marker=markers[0], end_marker=markers[1])
            check_listing(objects, marker=markers[0], end_marker=markers[2])
            check_listing(objects, marker=markers[1], end_marker=markers[3])
            check_listing(objects, marker=markers[1], end_marker=markers[3],
                          limit=self.max_shard_size // 4)
            check_listing(objects, marker=markers[1], end_marker=markers[3],
                          limit=self.max_shard_size // 4)
            check_listing(objects, marker=markers[1], end_marker=markers[2],
                          limit=self.max_shard_size // 2)
            check_listing(objects, marker=markers[1], end_marker=markers[1])
            check_listing(objects, reverse=True)
            check_listing(objects, reverse=True, end_marker=markers[1])
            check_listing(objects, reverse=True, marker=markers[3],
                          end_marker=markers[1],
                          limit=self.max_shard_size // 4)
            check_listing(objects, reverse=True, marker=markers[3],
                          end_marker=markers[1], limit=0)
            check_listing([], marker=markers[0], end_marker=markers[0])
            check_listing([], marker=markers[0], end_marker=markers[1],
                          reverse=True)
            check_listing(objects, prefix='obj')
            check_listing([], prefix='zzz')
            # delimiter
            headers, listing = client.get_container(
                self.url, self.token, self.container_name,
                query_string='delimiter=-')
            self.assertEqual([{'subdir': 'obj-'}], listing)

            limit = self.cluster_info['swift']['container_listing_limit']
            exc = check_listing_precondition_fails(limit=limit + 1)
            self.assertIn(b'Maximum limit', exc.http_response_content)
            exc = check_listing_precondition_fails(delimiter='ab')
            self.assertIn(b'Bad delimiter', exc.http_response_content)

        # sanity checks
        do_listing_checks(obj_names)

        # Shard the container
        client.post_container(self.url, self.admin_token, self.container_name,
                              headers={'X-Container-Sharding': 'on'})
        # First run the 'leader' in charge of scanning, which finds all shard
        # ranges and cleaves first two
        self.sharders.once(
            number=self.brain.node_numbers[0],
            additional_args='--partitions=%s' % self.brain.part)
        # Then run sharder on other nodes which will also cleave first two
        # shard ranges
        for n in self.brain.node_numbers[1:]:
            self.sharders.once(
                number=n, additional_args='--partitions=%s' % self.brain.part)

        # sanity check shard range states
        for node in self.brain.nodes:
            self.assert_container_state(node, 'sharding', 4)
        shard_ranges = self.get_container_shard_ranges()
        self.assertLengthEqual(shard_ranges, 4)
        self.assert_shard_range_state(ShardRange.CLEAVED, shard_ranges[:2])
        self.assert_shard_range_state(ShardRange.CREATED, shard_ranges[2:])

        self.assert_container_delete_fails()
        self.assert_container_has_shard_sysmeta()  # confirm no sysmeta deleted
        self.assert_container_post_ok('sharding')
        do_listing_checks(obj_names)

        # put some new objects spread through entire namespace
        new_obj_names = all_obj_names[1::4]
        self.put_objects(new_obj_names)

        # new objects that fell into the first two cleaved shard ranges are
        # reported in listing, new objects in the yet-to-be-cleaved shard
        # ranges are not yet included in listing
        exp_obj_names = [o for o in obj_names + new_obj_names
                         if o <= shard_ranges[1].upper]
        exp_obj_names += [o for o in obj_names
                          if o > shard_ranges[1].upper]
        exp_obj_names.sort()
        do_listing_checks(exp_obj_names)

        # run all the sharders again and the last two shard ranges get cleaved
        self.sharders.once(
            additional_args='--partitions=%s' % self.brain.part)
        for node in self.brain.nodes:
            self.assert_container_state(node, 'sharded', 4)
        shard_ranges = self.get_container_shard_ranges()
        self.assert_shard_range_state(ShardRange.ACTIVE, shard_ranges)

        exp_obj_names = obj_names + new_obj_names
        exp_obj_names.sort()
        do_listing_checks(exp_obj_names)
        self.assert_container_delete_fails()
        self.assert_container_has_shard_sysmeta()
        self.assert_container_post_ok('sharded')

        # delete original objects
        self.delete_objects(obj_names)
        do_listing_checks(new_obj_names)
        self.assert_container_delete_fails()
        self.assert_container_has_shard_sysmeta()
        self.assert_container_post_ok('sharded')


class TestContainerShardingUTF8(TestContainerShardingNonUTF8):
    def _make_object_names(self, number):
        # override default with names that include non-ascii chars
        name_length = self.cluster_info['swift']['max_object_name_length']
        obj_names = []
        for x in range(number):
            name = (u'obj-\u00e4\u00ea\u00ec\u00f2\u00fb-%04d' % x)
            name = name.encode('utf8').ljust(name_length, b'o')
            if not six.PY2:
                name = name.decode('utf8')
            obj_names.append(name)
        return obj_names

    def _setup_container_name(self):
        # override default with max length name that includes non-ascii chars
        super(TestContainerShardingUTF8, self)._setup_container_name()
        name_length = self.cluster_info['swift']['max_container_name_length']
        cont_name = self.container_name + u'-\u00e4\u00ea\u00ec\u00f2\u00fb'
        self.container_name = cont_name.ljust(name_length, 'x')
        if six.PY2:
            self.container_name = self.container_name.encode('utf8')


class TestContainerSharding(BaseTestContainerSharding):
    def _test_sharded_listing(self, run_replicators=False):
        obj_names = self._make_object_names(self.max_shard_size)
        self.put_objects(obj_names)

        # Verify that we start out with normal DBs, no shards
        found = self.categorize_container_dir_content()
        self.assertLengthEqual(found['normal_dbs'], 3)
        self.assertLengthEqual(found['shard_dbs'], 0)
        for db_file in found['normal_dbs']:
            broker = ContainerBroker(db_file)
            self.assertIs(True, broker.is_root_container())
            self.assertEqual('unsharded', broker.get_db_state())
            self.assertLengthEqual(broker.get_shard_ranges(), 0)

        headers, pre_sharding_listing = client.get_container(
            self.url, self.token, self.container_name)
        self.assertEqual(obj_names, [
            x['name'].encode('utf-8') if six.PY2 else x['name']
            for x in pre_sharding_listing])  # sanity

        # Shard it
        client.post_container(self.url, self.admin_token, self.container_name,
                              headers={'X-Container-Sharding': 'on'})
        pre_sharding_headers = client.head_container(
            self.url, self.admin_token, self.container_name)
        self.assertEqual('True',
                         pre_sharding_headers.get('x-container-sharding'))

        # Only run the one in charge of scanning
        self.sharders.once(
            number=self.brain.node_numbers[0],
            additional_args='--partitions=%s' % self.brain.part)

        # Verify that we have one sharded db -- though the other normal DBs
        # received the shard ranges that got defined
        found = self.categorize_container_dir_content()
        self.assertLengthEqual(found['shard_dbs'], 1)
        broker = self.get_broker(self.brain.part, self.brain.nodes[0])
        # sanity check - the shard db is on replica 0
        self.assertEqual(found['shard_dbs'][0], broker.db_file)
        self.assertIs(True, broker.is_root_container())
        self.assertEqual('sharded', broker.get_db_state())
        orig_root_shard_ranges = [dict(sr)
                                  for sr in broker.get_shard_ranges()]
        self.assertLengthEqual(orig_root_shard_ranges, 2)
        self.assert_total_object_count(len(obj_names), orig_root_shard_ranges)
        self.assert_shard_ranges_contiguous(2, orig_root_shard_ranges)
        self.assertEqual([ShardRange.ACTIVE, ShardRange.ACTIVE],
                         [sr['state'] for sr in orig_root_shard_ranges])
        self.direct_delete_container(expect_failure=True)

        self.assertLengthEqual(found['normal_dbs'], 2)
        for db_file in found['normal_dbs']:
            broker = ContainerBroker(db_file)
            self.assertIs(True, broker.is_root_container())
            self.assertEqual('unsharded', broker.get_db_state())
            shard_ranges = [dict(sr) for sr in broker.get_shard_ranges()]
            self.assertEqual([ShardRange.CREATED, ShardRange.CREATED],
                             [sr['state'] for sr in shard_ranges])
            # the sharded db had shard range meta_timestamps and state updated
            # during cleaving, so we do not expect those to be equal on other
            # nodes
            self.assert_shard_range_lists_equal(
                orig_root_shard_ranges, shard_ranges,
                excludes=['meta_timestamp', 'state', 'state_timestamp'])

        if run_replicators:
            Manager(['container-replicator']).once()
            # replication doesn't change the db file names
            found = self.categorize_container_dir_content()
            self.assertLengthEqual(found['shard_dbs'], 1)
            self.assertLengthEqual(found['normal_dbs'], 2)

        # Now that everyone has shard ranges, run *everyone*
        self.sharders.once(
            additional_args='--partitions=%s' % self.brain.part)

        # Verify that we only have shard dbs now
        found = self.categorize_container_dir_content()
        self.assertLengthEqual(found['shard_dbs'], 3)
        self.assertLengthEqual(found['normal_dbs'], 0)
        # Shards stayed the same
        for db_file in found['shard_dbs']:
            broker = ContainerBroker(db_file)
            self.assertIs(True, broker.is_root_container())
            self.assertEqual('sharded', broker.get_db_state())
            # Well, except for meta_timestamps, since the shards each reported
            self.assert_shard_range_lists_equal(
                orig_root_shard_ranges, broker.get_shard_ranges(),
                excludes=['meta_timestamp', 'state_timestamp'])
            for orig, updated in zip(orig_root_shard_ranges,
                                     broker.get_shard_ranges()):
                self.assertGreaterEqual(updated.state_timestamp,
                                        orig['state_timestamp'])
                self.assertGreaterEqual(updated.meta_timestamp,
                                        orig['meta_timestamp'])

        # Check that entire listing is available
        headers, actual_listing = self.assert_container_listing(obj_names)
        # ... and check some other container properties
        self.assertEqual(headers['last-modified'],
                         pre_sharding_headers['last-modified'])

        # It even works in reverse!
        headers, listing = client.get_container(self.url, self.token,
                                                self.container_name,
                                                query_string='reverse=on')
        self.assertEqual(pre_sharding_listing[::-1], listing)

        # Now put some new objects into first shard, taking its count to
        # 3 shard ranges' worth
        more_obj_names = [
            'beta%03d' % x for x in range(self.max_shard_size)]
        self.put_objects(more_obj_names)

        # The listing includes new objects...
        headers, listing = self.assert_container_listing(
            more_obj_names + obj_names)
        self.assertEqual(pre_sharding_listing, listing[len(more_obj_names):])

        # ...but root object count is out of date until the sharders run and
        # update the root
        self.assert_container_object_count(len(obj_names))

        # run sharders on the shard to get root updated
        shard_1 = ShardRange.from_dict(orig_root_shard_ranges[0])
        self.run_sharders(shard_1)
        self.assert_container_object_count(len(more_obj_names + obj_names))

        # we've added objects enough that we need to shard the first shard
        # *again* into three new sub-shards, but nothing happens until the root
        # leader identifies shard candidate...
        root_shard_ranges = self.direct_get_container_shard_ranges()
        for node, (hdrs, root_shards) in root_shard_ranges.items():
            self.assertLengthEqual(root_shards, 2)
            with annotate_failure('node %s. ' % node):
                self.assertEqual(
                    [ShardRange.ACTIVE] * 2,
                    [sr['state'] for sr in root_shards])
                # orig shards 0, 1 should be contiguous
                self.assert_shard_ranges_contiguous(2, root_shards)

        # Now run the root leader to identify shard candidate...while one of
        # the shard container servers is down
        shard_1_part, shard_1_nodes = self.get_part_and_node_numbers(shard_1)
        self.brain.servers.stop(number=shard_1_nodes[2])
        self.sharders.once(
            number=self.brain.node_numbers[0],
            additional_args='--partitions=%s' % self.brain.part)

        # ... so third replica of first shard state is not moved to sharding
        found_for_shard = self.categorize_container_dir_content(
            shard_1.account, shard_1.container)
        self.assertLengthEqual(found_for_shard['normal_dbs'], 3)
        self.assertEqual(
            [ShardRange.SHARDING, ShardRange.SHARDING, ShardRange.ACTIVE],
            [ContainerBroker(db_file).get_own_shard_range().state
             for db_file in found_for_shard['normal_dbs']])

        # ...then run first cycle of first shard sharders in order, leader
        # first, to get to predictable state where all nodes have cleaved 2 out
        # of 3 ranges...starting with first two nodes
        for node_number in shard_1_nodes[:2]:
            self.sharders.once(
                number=node_number,
                additional_args='--partitions=%s' % shard_1_part)

        # ... first two replicas start sharding to sub-shards
        found_for_shard = self.categorize_container_dir_content(
            shard_1.account, shard_1.container)
        self.assertLengthEqual(found_for_shard['shard_dbs'], 2)
        for db_file in found_for_shard['shard_dbs'][:2]:
            broker = ContainerBroker(db_file)
            with annotate_failure('shard db file %s. ' % db_file):
                self.assertIs(False, broker.is_root_container())
                self.assertEqual('sharding', broker.get_db_state())
                self.assertEqual(
                    ShardRange.SHARDING, broker.get_own_shard_range().state)
                shard_shards = broker.get_shard_ranges()
                self.assertEqual(
                    [ShardRange.CLEAVED, ShardRange.CLEAVED,
                     ShardRange.CREATED],
                    [sr.state for sr in shard_shards])
                self.assert_shard_ranges_contiguous(
                    3, shard_shards,
                    first_lower=orig_root_shard_ranges[0]['lower'],
                    last_upper=orig_root_shard_ranges[0]['upper'])

        # but third replica still has no idea it should be sharding
        self.assertLengthEqual(found_for_shard['normal_dbs'], 3)
        self.assertEqual(
            ShardRange.ACTIVE,
            ContainerBroker(
                found_for_shard['normal_dbs'][2]).get_own_shard_range().state)

        # ...but once sharder runs on third replica it will learn its state;
        # note that any root replica on the stopped container server also won't
        # know about the shards being in sharding state, so leave that server
        # stopped for now so that shard fetches its state from an up-to-date
        # root replica
        self.sharders.once(
            number=shard_1_nodes[2],
            additional_args='--partitions=%s' % shard_1_part)

        # third replica is sharding but has no sub-shard ranges yet...
        found_for_shard = self.categorize_container_dir_content(
            shard_1.account, shard_1.container)
        self.assertLengthEqual(found_for_shard['shard_dbs'], 2)
        self.assertLengthEqual(found_for_shard['normal_dbs'], 3)
        broker = ContainerBroker(found_for_shard['normal_dbs'][2])
        self.assertEqual('unsharded', broker.get_db_state())
        self.assertEqual(
            ShardRange.SHARDING, broker.get_own_shard_range().state)
        self.assertFalse(broker.get_shard_ranges())

        # ...until sub-shard ranges are replicated from another shard replica;
        # there may also be a sub-shard replica missing so run replicators on
        # all nodes to fix that if necessary
        self.brain.servers.start(number=shard_1_nodes[2])
        self.replicators.once()

        # now run sharder again on third replica
        self.sharders.once(
            number=shard_1_nodes[2],
            additional_args='--partitions=%s' % shard_1_part)

        # check original first shard range state and sub-shards - all replicas
        # should now be in consistent state
        found_for_shard = self.categorize_container_dir_content(
            shard_1.account, shard_1.container)
        self.assertLengthEqual(found_for_shard['shard_dbs'], 3)
        self.assertLengthEqual(found_for_shard['normal_dbs'], 3)
        for db_file in found_for_shard['shard_dbs']:
            broker = ContainerBroker(db_file)
            with annotate_failure('shard db file %s. ' % db_file):
                self.assertIs(False, broker.is_root_container())
                self.assertEqual('sharding', broker.get_db_state())
                self.assertEqual(
                    ShardRange.SHARDING, broker.get_own_shard_range().state)
                shard_shards = broker.get_shard_ranges()
                self.assertEqual(
                    [ShardRange.CLEAVED, ShardRange.CLEAVED,
                     ShardRange.CREATED],
                    [sr.state for sr in shard_shards])
                self.assert_shard_ranges_contiguous(
                    3, shard_shards,
                    first_lower=orig_root_shard_ranges[0]['lower'],
                    last_upper=orig_root_shard_ranges[0]['upper'])

        # check third sub-shard is in created state
        sub_shard = shard_shards[2]
        found_for_sub_shard = self.categorize_container_dir_content(
            sub_shard.account, sub_shard.container)
        self.assertFalse(found_for_sub_shard['shard_dbs'])
        self.assertLengthEqual(found_for_sub_shard['normal_dbs'], 3)
        for db_file in found_for_sub_shard['normal_dbs']:
            broker = ContainerBroker(db_file)
            with annotate_failure('sub shard db file %s. ' % db_file):
                self.assertIs(False, broker.is_root_container())
                self.assertEqual('unsharded', broker.get_db_state())
                self.assertEqual(
                    ShardRange.CREATED, broker.get_own_shard_range().state)
                self.assertFalse(broker.get_shard_ranges())

        # check root shard ranges
        root_shard_ranges = self.direct_get_container_shard_ranges()
        for node, (hdrs, root_shards) in root_shard_ranges.items():
            self.assertLengthEqual(root_shards, 5)
            with annotate_failure('node %s. ' % node):
                # shard ranges are sorted by upper, state, lower, so expect:
                # sub-shards, orig shard 0, orig shard 1
                self.assertEqual(
                    [ShardRange.CLEAVED, ShardRange.CLEAVED,
                     ShardRange.CREATED, ShardRange.SHARDING,
                     ShardRange.ACTIVE],
                    [sr['state'] for sr in root_shards])
                # sub-shards 0, 1, 2, orig shard 1 should be contiguous
                self.assert_shard_ranges_contiguous(
                    4, root_shards[:3] + root_shards[4:])
                # orig shards 0, 1 should be contiguous
                self.assert_shard_ranges_contiguous(2, root_shards[3:])

        self.assert_container_listing(more_obj_names + obj_names)
        self.assert_container_object_count(len(more_obj_names + obj_names))

        # Before writing, kill the cache
        self.memcache.delete(get_cache_key(
            self.account, self.container_name, shard='updating'))

        # add another object that lands in the first of the new sub-shards
        self.put_objects(['alpha'])

        # check that alpha object is in the first new shard
        shard_listings = self.direct_get_container(shard_shards[0].account,
                                                   shard_shards[0].container)
        for node, (hdrs, listing) in shard_listings.items():
            with annotate_failure(node):
                self.assertIn('alpha', [o['name'] for o in listing])
        self.assert_container_listing(['alpha'] + more_obj_names + obj_names)

        # Run sharders again so things settle.
        self.run_sharders(shard_1)

        # check original first shard range shards
        for db_file in found_for_shard['shard_dbs']:
            broker = ContainerBroker(db_file)
            with annotate_failure('shard db file %s. ' % db_file):
                self.assertIs(False, broker.is_root_container())
                self.assertEqual('sharded', broker.get_db_state())
                self.assertEqual(
                    [ShardRange.ACTIVE] * 3,
                    [sr.state for sr in broker.get_shard_ranges()])

        # check root shard ranges
        root_shard_ranges = self.direct_get_container_shard_ranges()
        for node, (hdrs, root_shards) in root_shard_ranges.items():
            # old first shard range should have been deleted
            self.assertLengthEqual(root_shards, 4)
            with annotate_failure('node %s. ' % node):
                self.assertEqual(
                    [ShardRange.ACTIVE] * 4,
                    [sr['state'] for sr in root_shards])
                self.assert_shard_ranges_contiguous(4, root_shards)

        headers, final_listing = self.assert_container_listing(
            ['alpha'] + more_obj_names + obj_names)

        # check root
        found = self.categorize_container_dir_content()
        self.assertLengthEqual(found['shard_dbs'], 3)
        self.assertLengthEqual(found['normal_dbs'], 0)
        new_shard_ranges = None
        for db_file in found['shard_dbs']:
            broker = ContainerBroker(db_file)
            self.assertIs(True, broker.is_root_container())
            self.assertEqual('sharded', broker.get_db_state())
            if new_shard_ranges is None:
                new_shard_ranges = broker.get_shard_ranges(
                    include_deleted=True)
                self.assertLengthEqual(new_shard_ranges, 5)
                # Second half is still there, and unchanged
                self.assertIn(
                    dict(orig_root_shard_ranges[1], meta_timestamp=None,
                         state_timestamp=None),
                    [dict(sr, meta_timestamp=None, state_timestamp=None)
                     for sr in new_shard_ranges])
                # But the first half split in three, then deleted
                by_name = {sr.name: sr for sr in new_shard_ranges}
                self.assertIn(orig_root_shard_ranges[0]['name'], by_name)
                old_shard_range = by_name.pop(
                    orig_root_shard_ranges[0]['name'])
                self.assertTrue(old_shard_range.deleted)
                self.assert_shard_ranges_contiguous(4, list(by_name.values()))
            else:
                # Everyone's on the same page. Well, except for
                # meta_timestamps, since the shards each reported
                other_shard_ranges = broker.get_shard_ranges(
                    include_deleted=True)
                self.assert_shard_range_lists_equal(
                    new_shard_ranges, other_shard_ranges,
                    excludes=['meta_timestamp', 'state_timestamp'])
                for orig, updated in zip(orig_root_shard_ranges,
                                         other_shard_ranges):
                    self.assertGreaterEqual(updated.meta_timestamp,
                                            orig['meta_timestamp'])

        self.assert_container_delete_fails()

        for obj in final_listing:
            client.delete_object(
                self.url, self.token, self.container_name, obj['name'])

        # the objects won't be listed anymore
        self.assert_container_listing([])
        # but root container stats will not yet be aware of the deletions
        self.assert_container_delete_fails()

        # One server was down while the shard sharded its first two sub-shards,
        # so there may be undeleted handoff db(s) for sub-shard(s) that were
        # not fully replicated; run replicators now to clean up so they no
        # longer report bogus stats to root.
        self.replicators.once()

        # Run sharder so that shard containers update the root. Do not run
        # sharder on root container because that triggers shrinks which can
        # cause root object count to temporarily be non-zero and prevent the
        # final delete.
        self.run_sharders(self.get_container_shard_ranges())
        # then root is empty and can be deleted
        self.assert_container_listing([])
        self.assert_container_object_count(0)
        client.delete_container(self.url, self.token, self.container_name)

    def test_sharded_listing_no_replicators(self):
        self._test_sharded_listing()

    def test_sharded_listing_with_replicators(self):
        self._test_sharded_listing(run_replicators=True)
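
    # 'async pendings' are container updates that an object server could not
    # deliver to a container server at PUT/DELETE time; they are queued on
    # disk for the object-updater to retry later.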
    def test_async_pendings(self):
        obj_names = self._make_object_names(self.max_shard_size * 2)

        # There are some updates *everyone* gets
        self.put_objects(obj_names[::5])
        # But roll some outages so each container only gets ~2/5 more object
        # records, i.e. a total of 3/5 updates per container; and async
        # pendings pile up
        for i, n in enumerate(self.brain.node_numbers, start=1):
            self.brain.servers.stop(number=n)
            self.put_objects(obj_names[i::5])
            self.brain.servers.start(number=n)
        # But there are also 1/5 updates *no one* gets
        self.brain.servers.stop()
        self.put_objects(obj_names[4::5])
        self.brain.servers.start()

        # Shard it
        client.post_container(self.url, self.admin_token, self.container_name,
                              headers={'X-Container-Sharding': 'on'})
        headers = client.head_container(self.url, self.admin_token,
                                        self.container_name)
        self.assertEqual('True', headers.get('x-container-sharding'))

        # sanity check
        found = self.categorize_container_dir_content()
        self.assertLengthEqual(found['shard_dbs'], 0)
        self.assertLengthEqual(found['normal_dbs'], 3)
        for db_file in found['normal_dbs']:
            broker = ContainerBroker(db_file)
            self.assertIs(True, broker.is_root_container())
            self.assertEqual(len(obj_names) * 3 // 5,
                             broker.get_info()['object_count'])

        # Only run the 'leader' in charge of scanning.
        # Each container has ~2 * max * 3/5 objects
        # which are distributed from obj000 to obj<2 * max - 1>,
        # so expect 3 shard ranges to be found: the first two will be complete
        # shards with max/2 objects and lower/upper bounds spaced by approx:
        #     (2 * max - 1)/(2 * max * 3/5) * (max/2) =~ 5/6 * max
        #
        # Note that during this shard cycle the leader replicates to other
        # nodes so they will end up with ~2 * max * 4/5 objects.
        self.sharders.once(
            number=self.brain.node_numbers[0],
            additional_args='--partitions=%s' % self.brain.part)

        # Verify that we have one shard db -- though the other normal DBs
        # received the shard ranges that got defined
        found = self.categorize_container_dir_content()
        self.assertLengthEqual(found['shard_dbs'], 1)
        node_index_zero_db = found['shard_dbs'][0]
        broker = ContainerBroker(node_index_zero_db)
        self.assertIs(True, broker.is_root_container())
        self.assertEqual(SHARDING, broker.get_db_state())
        expected_shard_ranges = broker.get_shard_ranges()
        self.assertLengthEqual(expected_shard_ranges, 3)
        self.assertEqual(
            [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED],
            [sr.state for sr in expected_shard_ranges])

        # Still have all three big DBs -- we've only cleaved 2 of the 3 shard
        # ranges that got defined
        self.assertLengthEqual(found['normal_dbs'], 3)
        db_states = []
        for db_file in found['normal_dbs']:
            broker = ContainerBroker(db_file)
            self.assertIs(True, broker.is_root_container())
            db_states.append(broker.get_db_state())
            # the sharded db had shard range meta_timestamps updated during
            # cleaving, so we do not expect those to be equal on other nodes
            self.assert_shard_range_lists_equal(
                expected_shard_ranges, broker.get_shard_ranges(),
                excludes=['meta_timestamp', 'state_timestamp', 'state'])
            self.assertEqual(len(obj_names) * 3 // 5,
                             broker.get_info()['object_count'])
        self.assertEqual([SHARDING, UNSHARDED, UNSHARDED], sorted(db_states))

        # Run the other sharders so we're all in (roughly) the same state
        for n in self.brain.node_numbers[1:]:
            self.sharders.once(
                number=n,
                additional_args='--partitions=%s' % self.brain.part)
        found = self.categorize_container_dir_content()
        self.assertLengthEqual(found['shard_dbs'], 3)
        self.assertLengthEqual(found['normal_dbs'], 3)
        for db_file in found['normal_dbs']:
            broker = ContainerBroker(db_file)
            self.assertEqual(SHARDING, broker.get_db_state())
            # no new rows
            self.assertEqual(len(obj_names) * 3 // 5,
                             broker.get_info()['object_count'])

        # Run updaters to clear the async pendings
        Manager(['object-updater']).once()

        # Our "big" dbs didn't take updates
        for db_file in found['normal_dbs']:
            broker = ContainerBroker(db_file)
            self.assertEqual(len(obj_names) * 3 // 5,
                             broker.get_info()['object_count'])

        # confirm that the async pending updates got redirected to the shards
        for sr in expected_shard_ranges:
            shard_listings = self.direct_get_container(sr.account,
                                                       sr.container)
            for node, (hdrs, listing) in shard_listings.items():
                shard_listing_names = [o['name'] for o in listing]
                for obj in obj_names[4::5]:
                    if obj in sr:
                        self.assertIn(obj, shard_listing_names)
                    else:
                        self.assertNotIn(obj, shard_listing_names)

        # The entire listing is not yet available - we have two cleaved shard
        # ranges, complete with async updates, but for the remainder of the
        # namespace only what landed in the original container
        headers, listing = client.get_container(self.url, self.token,
                                                self.container_name)
        start_listing = [
            o for o in obj_names if o <= expected_shard_ranges[1].upper]
        self.assertEqual(
            [x['name'].encode('utf-8') if six.PY2 else x['name']
             for x in listing[:len(start_listing)]],
            start_listing)
        # we can't assert much about the remaining listing, other than that
        # there should be something
        self.assertTrue(
            [x['name'].encode('utf-8') if six.PY2 else x['name']
             for x in listing[len(start_listing):]])
        self.assertIn('x-container-object-count', headers)
        self.assertEqual(str(len(listing)),
                         headers['x-container-object-count'])

        headers, listing = client.get_container(self.url, self.token,
                                                self.container_name,
                                                query_string='reverse=on')
        self.assertEqual([x['name'].encode('utf-8') if six.PY2 else x['name']
                          for x in listing[-len(start_listing):]],
                         list(reversed(start_listing)))
        self.assertIn('x-container-object-count', headers)
        self.assertEqual(str(len(listing)),
                         headers['x-container-object-count'])
        self.assertTrue(
            [x['name'].encode('utf-8') if six.PY2 else x['name']
             for x in listing[:-len(start_listing)]])

        # Run the sharders again to get everything to settle
        self.sharders.once()
        found = self.categorize_container_dir_content()
        self.assertLengthEqual(found['shard_dbs'], 3)
        self.assertLengthEqual(found['normal_dbs'], 0)
        # now all shards have been cleaved we should get the complete listing
        headers, listing = client.get_container(self.url, self.token,
                                                self.container_name)
        self.assertEqual([x['name'].encode('utf-8') if six.PY2 else x['name']
                          for x in listing],
                         obj_names)

    def test_shrinking(self):
        int_client = self.make_internal_client()

        def check_node_data(node_data, exp_hdrs, exp_obj_count, exp_shards):
            hdrs, range_data = node_data
            self.assert_dict_contains(exp_hdrs, hdrs)
            self.assert_shard_ranges_contiguous(exp_shards, range_data)
            self.assert_total_object_count(exp_obj_count, range_data)

        def check_shard_nodes_data(node_data, expected_state='unsharded',
                                   expected_shards=0, exp_obj_count=0):
            # checks that shard range is consistent on all nodes
            root_path = '%s/%s' % (self.account, self.container_name)
            exp_shard_hdrs = {'X-Container-Sysmeta-Shard-Root': root_path,
                              'X-Backend-Sharding-State': expected_state}
            object_counts = []
            bytes_used = []
            for node_id, node_data in node_data.items():
                with annotate_failure('Node id %s.' % node_id):
                    check_node_data(
                        node_data, exp_shard_hdrs, exp_obj_count,
                        expected_shards)
                hdrs = node_data[0]
                object_counts.append(int(hdrs['X-Container-Object-Count']))
                bytes_used.append(int(hdrs['X-Container-Bytes-Used']))
            if len(set(object_counts)) != 1:
                self.fail('Inconsistent object counts: %s' % object_counts)
            if len(set(bytes_used)) != 1:
                self.fail('Inconsistent bytes used: %s' % bytes_used)
            return object_counts[0], bytes_used[0]

        repeat = [0]
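        # ('repeat' is a single-element list so that the nested function
        # below can mutate the counter without 'nonlocal', which keeps this
        # py2-compatible; the file supports both via six)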

        def do_shard_then_shrink():
            repeat[0] += 1
            obj_names = ['obj-%s-%03d' % (repeat[0], x)
                         for x in range(self.max_shard_size)]
            self.put_objects(obj_names)
            # these two object names will fall at start of first shard range...
            alpha = 'alpha-%s' % repeat[0]
            beta = 'beta-%s' % repeat[0]

            # Enable sharding
            client.post_container(
                self.url, self.admin_token, self.container_name,
                headers={'X-Container-Sharding': 'on'})

            # sanity check
            self.assert_container_listing(obj_names)

            # Only run the one in charge of scanning
            self.sharders.once(
                number=self.brain.node_numbers[0],
                additional_args='--partitions=%s' % self.brain.part)

            # check root container
            root_nodes_data = self.direct_get_container_shard_ranges()
            self.assertEqual(3, len(root_nodes_data))

            # nodes on which sharder has not run are still in unsharded state
            # but have had shard ranges replicated to them
            exp_obj_count = len(obj_names)
            exp_hdrs = {'X-Backend-Sharding-State': 'unsharded',
                        'X-Container-Object-Count': str(exp_obj_count)}
            node_id = self.brain.node_numbers[1] - 1
            check_node_data(
                root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2)
            node_id = self.brain.node_numbers[2] - 1
            check_node_data(
                root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2)

            # only the node that ran the sharder is in sharded state
            exp_hdrs['X-Backend-Sharding-State'] = 'sharded'
            node_id = self.brain.node_numbers[0] - 1
            check_node_data(
                root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2)

            orig_range_data = root_nodes_data[node_id][1]
            orig_shard_ranges = [ShardRange.from_dict(r)
                                 for r in orig_range_data]

            # check first shard
            shard_nodes_data = self.direct_get_container_shard_ranges(
                orig_shard_ranges[0].account, orig_shard_ranges[0].container)
            obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data)
            total_shard_object_count = obj_count

            # check second shard
            shard_nodes_data = self.direct_get_container_shard_ranges(
                orig_shard_ranges[1].account, orig_shard_ranges[1].container)
            obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data)
            total_shard_object_count += obj_count
            self.assertEqual(exp_obj_count, total_shard_object_count)

            # Now that everyone has shard ranges, run *everyone*
            self.sharders.once(
                additional_args='--partitions=%s' % self.brain.part)

            # all root container nodes should now be in sharded state
            root_nodes_data = self.direct_get_container_shard_ranges()
            self.assertEqual(3, len(root_nodes_data))
            for node_id, node_data in root_nodes_data.items():
                with annotate_failure('Node id %s.' % node_id):
                    check_node_data(node_data, exp_hdrs, exp_obj_count, 2)

            # run updaters to update .sharded account; shard containers have
            # not updated account since having objects replicated to them
            self.updaters.once()
            shard_cont_count, shard_obj_count = int_client.get_account_info(
                orig_shard_ranges[0].account, [204])
            self.assertEqual(2 * repeat[0], shard_cont_count)
            # the shards account should always have zero object count to avoid
            # double accounting
            self.assertEqual(0, shard_obj_count)

            # checking the listing also refreshes proxy container info cache
            # so that the proxy becomes aware that container is sharded and
            # will now look up the shard target for subsequent updates
            self.assert_container_listing(obj_names)

            # Before writing, kill the cache
            self.memcache.delete(get_cache_key(
                self.account, self.container_name, shard='updating'))
  1096. # delete objects from first shard range
  1097. first_shard_objects = [obj_name for obj_name in obj_names
  1098. if obj_name <= orig_shard_ranges[0].upper]
  1099. for obj in first_shard_objects:
  1100. client.delete_object(
  1101. self.url, self.token, self.container_name, obj)
  1102. with self.assertRaises(ClientException):
  1103. client.get_object(
  1104. self.url, self.token, self.container_name, obj)
  1105. second_shard_objects = [obj_name for obj_name in obj_names
  1106. if obj_name > orig_shard_ranges[1].lower]
  1107. self.assert_container_listing(second_shard_objects)
  1108. self.put_objects([alpha])
  1109. second_shard_objects = [obj_name for obj_name in obj_names
  1110. if obj_name > orig_shard_ranges[1].lower]
  1111. self.assert_container_listing([alpha] + second_shard_objects)
  1112. # while container servers are down, but proxy has container info in
  1113. # cache from recent listing, put another object; this update will
  1114. # lurk in async pending until the updaters run again; because all
  1115. # the root container servers are down and therefore cannot respond
  1116. # to a GET for a redirect target, the object update will default to
  1117. # being targeted at the root container
  1118. self.stop_container_servers()
  1119. # Before writing, kill the cache
  1120. self.memcache.delete(get_cache_key(
  1121. self.account, self.container_name, shard='updating'))
  1122. self.put_objects([beta])
  1123. self.brain.servers.start()
  1124. async_pendings = self.gather_async_pendings(
  1125. self.get_all_object_nodes())
  1126. num_container_replicas = len(self.brain.nodes)
  1127. num_obj_replicas = self.policy.object_ring.replica_count
  1128. expected_num_updates = num_container_updates(
  1129. num_container_replicas, quorum_size(num_container_replicas),
  1130. num_obj_replicas, self.policy.quorum)
  1131. expected_num_pendings = min(expected_num_updates, num_obj_replicas)
  1132. # sanity check
  1133. with annotate_failure('policy %s. ' % self.policy):
  1134. self.assertLengthEqual(async_pendings, expected_num_pendings)
  1135. # root object count is not updated...
  1136. self.assert_container_object_count(len(obj_names))
  1137. self.assert_container_listing([alpha] + second_shard_objects)
  1138. root_nodes_data = self.direct_get_container_shard_ranges()
  1139. self.assertEqual(3, len(root_nodes_data))
  1140. for node_id, node_data in root_nodes_data.items():
  1141. with annotate_failure('Node id %s.' % node_id):
  1142. check_node_data(node_data, exp_hdrs, exp_obj_count, 2)
  1143. range_data = node_data[1]
  1144. self.assert_shard_range_lists_equal(
  1145. orig_range_data, range_data,
  1146. excludes=['meta_timestamp', 'state_timestamp'])
  1147. # ...until the sharders run and update root
  1148. self.run_sharders(orig_shard_ranges[0])
  1149. exp_obj_count = len(second_shard_objects) + 1
  1150. self.assert_container_object_count(exp_obj_count)
  1151. self.assert_container_listing([alpha] + second_shard_objects)
  1152. # root sharder finds donor, acceptor pair and pushes changes
  1153. self.sharders.once(
  1154. additional_args='--partitions=%s' % self.brain.part)
  1155. self.assert_container_listing([alpha] + second_shard_objects)
  1156. # run sharder on donor to shrink and replicate to acceptor
  1157. self.run_sharders(orig_shard_ranges[0])
  1158. self.assert_container_listing([alpha] + second_shard_objects)
  1159. # run sharder on acceptor to update root with stats
  1160. self.run_sharders(orig_shard_ranges[1])
  1161. self.assert_container_listing([alpha] + second_shard_objects)
  1162. self.assert_container_object_count(len(second_shard_objects) + 1)
  1163. # check root container
  1164. root_nodes_data = self.direct_get_container_shard_ranges()
  1165. self.assertEqual(3, len(root_nodes_data))
  1166. exp_hdrs['X-Container-Object-Count'] = str(exp_obj_count)
  1167. for node_id, node_data in root_nodes_data.items():
  1168. with annotate_failure('Node id %s.' % node_id):
  1169. # NB now only *one* shard range in root
  1170. check_node_data(node_data, exp_hdrs, exp_obj_count, 1)
1171. # the acceptor shard is intact...
  1172. shard_nodes_data = self.direct_get_container_shard_ranges(
  1173. orig_shard_ranges[1].account, orig_shard_ranges[1].container)
  1174. obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data)
  1175. # all objects should now be in this shard
  1176. self.assertEqual(exp_obj_count, obj_count)
  1177. # the donor shard is also still intact
  1178. donor = orig_shard_ranges[0]
  1179. shard_nodes_data = self.direct_get_container_shard_ranges(
  1180. donor.account, donor.container)
  1181. # the donor's shard range will have the acceptor's projected stats
  1182. obj_count, bytes_used = check_shard_nodes_data(
  1183. shard_nodes_data, expected_state='sharded', expected_shards=1,
  1184. exp_obj_count=len(second_shard_objects) + 1)
  1185. # but the donor is empty and so reports zero stats
  1186. self.assertEqual(0, obj_count)
  1187. self.assertEqual(0, bytes_used)
1188. # check the donor's own shard range state
  1189. part, nodes = self.brain.ring.get_nodes(
  1190. donor.account, donor.container)
  1191. for node in nodes:
  1192. with annotate_failure(node):
  1193. broker = self.get_broker(
  1194. part, node, donor.account, donor.container)
  1195. own_sr = broker.get_own_shard_range()
  1196. self.assertEqual(ShardRange.SHARDED, own_sr.state)
  1197. self.assertTrue(own_sr.deleted)
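# (a donor that has shrunk away marks its own shard range SHARDED and
# deleted so that all of its replicas converge on its removal)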
1198. # delete all of the second shard's objects apart from 'alpha'
  1199. for obj in second_shard_objects:
  1200. client.delete_object(
  1201. self.url, self.token, self.container_name, obj)
  1202. self.assert_container_listing([alpha])
1203. # run sharders so the second range shrinks away; this requires up
1204. # to 3 cycles
  1205. self.sharders.once() # shard updates root stats
  1206. self.assert_container_listing([alpha])
  1207. self.sharders.once() # root finds shrinkable shard
  1208. self.assert_container_listing([alpha])
  1209. self.sharders.once() # shards shrink themselves
  1210. self.assert_container_listing([alpha])
  1211. # the second shard range has sharded and is empty
  1212. shard_nodes_data = self.direct_get_container_shard_ranges(
  1213. orig_shard_ranges[1].account, orig_shard_ranges[1].container)
  1214. check_shard_nodes_data(
  1215. shard_nodes_data, expected_state='sharded', expected_shards=1,
  1216. exp_obj_count=1)
  1217. # check root container
  1218. root_nodes_data = self.direct_get_container_shard_ranges()
  1219. self.assertEqual(3, len(root_nodes_data))
  1220. exp_hdrs = {'X-Backend-Sharding-State': 'collapsed',
  1221. # just the alpha object
  1222. 'X-Container-Object-Count': '1'}
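# (a root that no longer has any shard ranges reports the 'collapsed'
# sharding state)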
  1223. for node_id, node_data in root_nodes_data.items():
  1224. with annotate_failure('Node id %s.' % node_id):
  1225. # NB now no shard ranges in root
  1226. check_node_data(node_data, exp_hdrs, 0, 0)
  1227. # delete the alpha object
  1228. client.delete_object(
  1229. self.url, self.token, self.container_name, alpha)
  1230. # should now be able to delete the *apparently* empty container
  1231. client.delete_container(self.url, self.token, self.container_name)
  1232. self.assert_container_not_found()
  1233. self.direct_head_container(expect_failure=True)
1234. # and the container stays deleted even after sharders run and shards
1235. # send updates
  1236. self.sharders.once()
  1237. self.assert_container_not_found()
  1238. self.direct_head_container(expect_failure=True)
  1239. # now run updaters to deal with the async pending for the beta
  1240. # object
  1241. self.updaters.once()
  1242. # and the container is revived!
  1243. self.assert_container_listing([beta])
  1244. # finally, clear out the container
  1245. client.delete_object(
  1246. self.url, self.token, self.container_name, beta)
  1247. do_shard_then_shrink()
  1248. # repeat from starting point of a collapsed and previously deleted
  1249. # container
  1250. do_shard_then_shrink()
  1251. def _setup_replication_scenario(self, num_shards, extra_objs=('alpha',)):
1252. # Get the cluster to a state where 2 replicas are sharding or sharded
1253. # but the 3rd is unsharded and has an object the first 2 are missing.
  1254. # put objects while all servers are up
  1255. obj_names = self._make_object_names(
  1256. num_shards * self.max_shard_size // 2)
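# (assuming the sharder's default of shard_container_threshold // 2
# rows per shard, this object count should yield num_shards shard
# ranges)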
  1257. self.put_objects(obj_names)
  1258. client.post_container(self.url, self.admin_token, self.container_name,
  1259. headers={'X-Container-Sharding': 'on'})
  1260. node_numbers = self.brain.node_numbers
  1261. # run replicators first time to get sync points set
  1262. self.replicators.once()
  1263. # stop the leader node and one other server
  1264. self.stop_container_servers(slice(0, 2))
  1265. # ...then put one more object in first shard range namespace
  1266. self.put_objects(extra_objs)
  1267. # start leader and first other server, stop third server
  1268. for number in node_numbers[:2]:
  1269. self.brain.servers.start(number=number)
  1270. self.brain.servers.stop(number=node_numbers[2])
  1271. self.assert_container_listing(obj_names) # sanity check
  1272. # shard the container - first two shard ranges are cleaved
  1273. for number in node_numbers[:2]:
  1274. self.sharders.once(
  1275. number=number,
  1276. additional_args='--partitions=%s' % self.brain.part)
  1277. self.assert_container_listing(obj_names) # sanity check
  1278. return obj_names
  1279. def test_replication_to_sharding_container(self):
  1280. # verify that replication from an unsharded replica to a sharding
  1281. # replica does not replicate rows but does replicate shard ranges
  1282. obj_names = self._setup_replication_scenario(3)
  1283. for node in self.brain.nodes[:2]:
  1284. self.assert_container_state(node, 'sharding', 3)
  1285. # bring third server back up, run replicator
  1286. node_numbers = self.brain.node_numbers
  1287. self.brain.servers.start(number=node_numbers[2])
  1288. # sanity check...
  1289. self.assert_container_state(self.brain.nodes[2], 'unsharded', 0)
  1290. self.replicators.once(number=node_numbers[2])
  1291. # check db files unchanged
  1292. found = self.categorize_container_dir_content()
  1293. self.assertLengthEqual(found['shard_dbs'], 2)
  1294. self.assertLengthEqual(found['normal_dbs'], 3)
1295. # the 'alpha' object is NOT replicated to the two sharding nodes
  1296. for node in self.brain.nodes[:2]:
  1297. broker = self.get_broker(self.brain.part, node)
  1298. with annotate_failure(
  1299. 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])):
  1300. self.assertFalse(broker.get_objects())
  1301. self.assert_container_state(node, 'sharding', 3)
  1302. self.brain.servers.stop(number=node_numbers[2])
  1303. self.assert_container_listing(obj_names)
  1304. # all nodes now have shard ranges
  1305. self.brain.servers.start(number=node_numbers[2])
  1306. node_data = self.direct_get_container_shard_ranges()
  1307. for node, (hdrs, shard_ranges) in node_data.items():
  1308. with annotate_failure(node):
  1309. self.assert_shard_ranges_contiguous(3, shard_ranges)
  1310. # complete cleaving third shard range on first two nodes
  1311. self.brain.servers.stop(number=node_numbers[2])
  1312. for number in node_numbers[:2]:
  1313. self.sharders.once(
  1314. number=number,
  1315. additional_args='--partitions=%s' % self.brain.part)
  1316. # ...and now they are in sharded state
  1317. self.assert_container_state(self.brain.nodes[0], 'sharded', 3)
  1318. self.assert_container_state(self.brain.nodes[1], 'sharded', 3)
  1319. # ...still no 'alpha' object in listing
  1320. self.assert_container_listing(obj_names)
1321. # run the sharder on the third server; the alpha object is included
1322. # in the shards that it cleaves
  1323. self.brain.servers.start(number=node_numbers[2])
  1324. self.assert_container_state(self.brain.nodes[2], 'unsharded', 3)
  1325. self.sharders.once(number=node_numbers[2],
  1326. additional_args='--partitions=%s' % self.brain.part)
  1327. self.assert_container_state(self.brain.nodes[2], 'sharding', 3)
  1328. self.sharders.once(number=node_numbers[2],
  1329. additional_args='--partitions=%s' % self.brain.part)
  1330. self.assert_container_state(self.brain.nodes[2], 'sharded', 3)
  1331. self.assert_container_listing(['alpha'] + obj_names)
  1332. def test_replication_to_sharded_container(self):
  1333. # verify that replication from an unsharded replica to a sharded
  1334. # replica does not replicate rows but does replicate shard ranges
  1335. obj_names = self._setup_replication_scenario(2)
  1336. for node in self.brain.nodes[:2]:
  1337. self.assert_container_state(node, 'sharded', 2)
  1338. # sanity check
  1339. found = self.categorize_container_dir_content()
  1340. self.assertLengthEqual(found['shard_dbs'], 2)
  1341. self.assertLengthEqual(found['normal_dbs'], 1)
  1342. for node in self.brain.nodes[:2]:
  1343. broker = self.get_broker(self.brain.part, node)
  1344. info = broker.get_info()
  1345. with annotate_failure(
  1346. 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])):
  1347. self.assertEqual(len(obj_names), info['object_count'])
  1348. self.assertFalse(broker.get_objects())
  1349. # bring third server back up, run replicator
  1350. node_numbers = self.brain.node_numbers
  1351. self.brain.servers.start(number=node_numbers[2])
  1352. # sanity check...
  1353. self.assert_container_state(self.brain.nodes[2], 'unsharded', 0)
  1354. self.replicators.once(number=node_numbers[2])
  1355. # check db files unchanged
  1356. found = self.categorize_container_dir_content()
  1357. self.assertLengthEqual(found['shard_dbs'], 2)
  1358. self.assertLengthEqual(found['normal_dbs'], 1)
  1359. # the 'alpha' object is NOT replicated to the two sharded nodes
  1360. for node in self.brain.nodes[:2]:
  1361. broker = self.get_broker(self.brain.part, node)
  1362. with annotate_failure(
  1363. 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])):
  1364. self.assertFalse(broker.get_objects())
  1365. self.assert_container_state(node, 'sharded', 2)
  1366. self.brain.servers.stop(number=node_numbers[2])
  1367. self.assert_container_listing(obj_names)
  1368. # all nodes now have shard ranges
  1369. self.brain.servers.start(number=node_numbers[2])
  1370. node_data = self.direct_get_container_shard_ranges()
  1371. for node, (hdrs, shard_ranges) in node_data.items():
  1372. with annotate_failure(node):
  1373. self.assert_shard_ranges_contiguous(2, shard_ranges)
1374. # run the sharder on the third server; the alpha object is included
1375. # in the shards that it cleaves
  1376. self.assert_container_state(self.brain.nodes[2], 'unsharded', 2)
  1377. self.sharders.once(number=node_numbers[2],
  1378. additional_args='--partitions=%s' % self.brain.part)
  1379. self.assert_container_state(self.brain.nodes[2], 'sharded', 2)
  1380. self.assert_container_listing(['alpha'] + obj_names)
  1381. def test_sharding_requires_sufficient_replication(self):
  1382. # verify that cleaving only progresses if each cleaved shard range is
  1383. # sufficiently replicated
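# (the sharder replicates each cleaved shard db to that shard's
# primaries and should only advance its cleave cursor once enough -
# a quorum - of those replication attempts succeed; the test below
# exercises that by stopping varying numbers of shard servers)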
  1384. # put enough objects for 4 shard ranges
  1385. obj_names = self._make_object_names(2 * self.max_shard_size)
  1386. self.put_objects(obj_names)
  1387. client.post_container(self.url, self.admin_token, self.container_name,
  1388. headers={'X-Container-Sharding': 'on'})
  1389. node_numbers = self.brain.node_numbers
  1390. leader_node = self.brain.nodes[0]
  1391. leader_num = node_numbers[0]
  1392. # run replicators first time to get sync points set
  1393. self.replicators.once()
  1394. # start sharding on the leader node
  1395. self.sharders.once(number=leader_num,
  1396. additional_args='--partitions=%s' % self.brain.part)
  1397. shard_ranges = self.assert_container_state(leader_node, 'sharding', 4)
  1398. self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2,
  1399. [sr.state for sr in shard_ranges])
1400. # stop *all* container servers for the third shard range
  1401. sr_part, sr_node_nums = self.get_part_and_node_numbers(shard_ranges[2])
  1402. for node_num in sr_node_nums:
  1403. self.brain.servers.stop(number=node_num)
  1404. # attempt to continue sharding on the leader node
  1405. self.sharders.once(number=leader_num,
  1406. additional_args='--partitions=%s' % self.brain.part)
  1407. # no cleaving progress was made
  1408. for node_num in sr_node_nums:
  1409. self.brain.servers.start(number=node_num)
  1410. shard_ranges = self.assert_container_state(leader_node, 'sharding', 4)
  1411. self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2,
  1412. [sr.state for sr in shard_ranges])
1413. # stop two of the servers for the third shard range, not including
1414. # any server that happens to be the leader node
  1415. stopped = []
  1416. for node_num in sr_node_nums:
  1417. if node_num != leader_num:
  1418. self.brain.servers.stop(number=node_num)
  1419. stopped.append(node_num)
  1420. if len(stopped) >= 2:
  1421. break
  1422. self.assertLengthEqual(stopped, 2) # sanity check
  1423. # attempt to continue sharding on the leader node
  1424. self.sharders.once(number=leader_num,
  1425. additional_args='--partitions=%s' % self.brain.part)
  1426. # no cleaving progress was made
  1427. for node_num in stopped:
  1428. self.brain.servers.start(number=node_num)
  1429. shard_ranges = self.assert_container_state(leader_node, 'sharding', 4)
  1430. self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2,
  1431. [sr.state for sr in shard_ranges])
1432. # stop just one of the servers for the third shard range
  1433. stopped = []
  1434. for node_num in sr_node_nums:
  1435. if node_num != leader_num:
  1436. self.brain.servers.stop(number=node_num)
  1437. stopped.append(node_num)
  1438. break
  1439. self.assertLengthEqual(stopped, 1) # sanity check
  1440. # attempt to continue sharding the container
  1441. self.sharders.once(number=leader_num,
  1442. additional_args='--partitions=%s' % self.brain.part)
  1443. # this time cleaving completed
  1444. self.brain.servers.start(number=stopped[0])
  1445. shard_ranges = self.assert_container_state(leader_node, 'sharded', 4)
  1446. self.assertEqual([ShardRange.ACTIVE] * 4,
  1447. [sr.state for sr in shard_ranges])
  1448. def test_sharded_delete(self):
  1449. all_obj_names = self._make_object_names(self.max_shard_size)
  1450. self.put_objects(all_obj_names)
  1451. # Shard the container
  1452. client.post_container(self.url, self.admin_token, self.container_name,
  1453. headers={'X-Container-Sharding': 'on'})
  1454. for n in self.brain.node_numbers:
  1455. self.sharders.once(
  1456. number=n, additional_args='--partitions=%s' % self.brain.part)
  1457. # sanity checks
  1458. for node in self.brain.nodes:
  1459. self.assert_container_state(node, 'sharded', 2)
  1460. self.assert_container_delete_fails()
  1461. self.assert_container_has_shard_sysmeta()
  1462. self.assert_container_post_ok('sharded')
  1463. self.assert_container_listing(all_obj_names)
  1464. # delete all objects - updates redirected to shards
  1465. self.delete_objects(all_obj_names)
  1466. self.assert_container_listing([])
  1467. self.assert_container_post_ok('has objects')
  1468. # root not yet updated with shard stats
  1469. self.assert_container_object_count(len(all_obj_names))
  1470. self.assert_container_delete_fails()
  1471. self.assert_container_has_shard_sysmeta()
  1472. # run sharder on shard containers to update root stats
  1473. shard_ranges = self.get_container_shard_ranges()
  1474. self.assertLengthEqual(shard_ranges, 2)
  1475. self.run_sharders(shard_ranges)
  1476. self.assert_container_listing([])
  1477. self.assert_container_post_ok('empty')
  1478. self.assert_container_object_count(0)
  1479. # put a new object - update redirected to shard
  1480. self.put_objects(['alpha'])
  1481. self.assert_container_listing(['alpha'])
  1482. self.assert_container_object_count(0)
  1483. # before root learns about new object in shard, delete the container
  1484. client.delete_container(self.url, self.token, self.container_name)
  1485. self.assert_container_post_fails('deleted')
  1486. self.assert_container_not_found()
  1487. # run the sharders to update root with shard stats
  1488. self.run_sharders(shard_ranges)
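# the shard's non-zero stats are merged into the root db, which
# undeletes ('revives') the root container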
  1489. self.assert_container_listing(['alpha'])
  1490. self.assert_container_object_count(1)
  1491. self.assert_container_delete_fails()
  1492. self.assert_container_post_ok('revived')
  1493. def test_object_update_redirection(self):
  1494. all_obj_names = self._make_object_names(self.max_shard_size)
  1495. self.put_objects(all_obj_names)
  1496. # Shard the container
  1497. client.post_container(self.url, self.admin_token, self.container_name,
  1498. headers={'X-Container-Sharding': 'on'})
  1499. for n in self.brain.node_numbers:
  1500. self.sharders.once(
  1501. number=n, additional_args='--partitions=%s' % self.brain.part)
  1502. # sanity checks
  1503. for node in self.brain.nodes:
  1504. self.assert_container_state(node, 'sharded', 2)
  1505. self.assert_container_delete_fails()
  1506. self.assert_container_has_shard_sysmeta()
  1507. self.assert_container_post_ok('sharded')
  1508. self.assert_container_listing(all_obj_names)
  1509. # delete all objects - updates redirected to shards
  1510. self.delete_objects(all_obj_names)
  1511. self.assert_container_listing([])
  1512. self.assert_container_post_ok('has objects')
  1513. # run sharder on shard containers to update root stats
  1514. shard_ranges = self.get_container_shard_ranges()
  1515. self.assertLengthEqual(shard_ranges, 2)
  1516. self.run_sharders(shard_ranges)
  1517. self.assert_container_object_count(0)
  1518. # First, test a misplaced object moving from one shard to another.
  1519. # with one shard server down, put a new 'alpha' object...
  1520. shard_part, shard_nodes = self.get_part_and_node_numbers(
  1521. shard_ranges[0])
  1522. self.brain.servers.stop(number=shard_nodes[2])
  1523. self.put_objects(['alpha'])
  1524. self.assert_container_listing(['alpha'])
  1525. self.assert_container_object_count(0)
  1526. self.assertLengthEqual(
  1527. self.gather_async_pendings(self.get_all_object_nodes()), 1)
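# (the object PUT could not update the stopped shard replica, so
# exactly one container update was deferred to an async pending)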
  1528. self.brain.servers.start(number=shard_nodes[2])
  1529. # run sharder on root to discover first shrink candidate
  1530. self.sharders.once(additional_args='--partitions=%s' % self.brain.part)
  1531. # then run sharder on the shard node without the alpha object
  1532. self.sharders.once(additional_args='--partitions=%s' % shard_part,
  1533. number=shard_nodes[2])
1534. # root sees the first shard has shrunk; only the second shard range
1535. # is used for listing, so the alpha object is not in the listing
  1536. self.assertLengthEqual(self.get_container_shard_ranges(), 1)
  1537. self.assert_container_listing([])
  1538. self.assert_container_object_count(0)
  1539. # run the updaters: the async pending update will be redirected from
  1540. # shrunk shard to second shard
  1541. self.updaters.once()
  1542. self.assert_container_listing(['alpha'])
  1543. self.assert_container_object_count(0) # root not yet updated
  1544. # then run sharder on other shard nodes to complete shrinking
  1545. for number in shard_nodes[:2]:
  1546. self.sharders.once(additional_args='--partitions=%s' % shard_part,
  1547. number=number)
  1548. # and get root updated
  1549. self.run_sharders(shard_ranges[1])
  1550. self.assert_container_listing(['alpha'])
  1551. self.assert_container_object_count(1)
  1552. self.assertLengthEqual(self.get_container_shard_ranges(), 1)
1553. # Now that we have just one active shard, test a misplaced object
1554. # moving from that shard to the root.
  1555. # with one shard server down, delete 'alpha' and put a 'beta' object...
  1556. shard_part, shard_nodes = self.get_part_and_node_numbers(
  1557. shard_ranges[1])
  1558. self.brain.servers.stop(number=shard_nodes[2])
  1559. # Before writing, kill the cache
  1560. self.memcache.delete(get_cache_key(
  1561. self.account, self.container_name, shard='updating'))
  1562. self.delete_objects(['alpha'])
  1563. self.put_objects(['beta'])
  1564. self.assert_container_listing(['beta'])
  1565. self.assert_container_object_count(1)
  1566. self.assertLengthEqual(
  1567. self.gather_async_pendings(self.get_all_object_nodes()), 2)
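# (the delete of alpha and the put of beta each failed to update the
# stopped shard replica, leaving one async pending apiece)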
  1568. self.brain.servers.start(number=shard_nodes[2])
  1569. # run sharder on root to discover second shrink candidate - root is not
  1570. # yet aware of the beta object
  1571. self.sharders.once(additional_args='--partitions=%s' % self.brain.part)
1572. # then run sharder on the shard node without the beta object, to shrink
1573. # it to root - note this moves a stale copy of alpha to the root db
  1574. self.sharders.once(additional_args='--partitions=%s' % shard_part,
  1575. number=shard_nodes[2])
  1576. # now there are no active shards
  1577. self.assertFalse(self.get_container_shard_ranges())
1578. # with the other two shard servers down, listing won't find the beta object
  1579. for number in shard_nodes[:2]:
  1580. self.brain.servers.stop(number=number)
  1581. self.assert_container_listing(['alpha'])
  1582. self.assert_container_object_count(1)
  1583. # run the updaters: the async pending update will be redirected from
  1584. # shrunk shard to the root
  1585. self.updaters.once()
  1586. self.assert_container_listing(['beta'])
  1587. self.assert_container_object_count(1)
  1588. def test_misplaced_object_movement(self):
  1589. def merge_object(shard_range, name, deleted=0):
1590. # it's hard to get a test to put a misplaced object into a shard,
1591. # so this hack is used to force an object record directly into a
1592. # shard container db. Note: the actual object won't exist; we're
1593. # just using this to test object records in container dbs.
  1594. shard_part, shard_nodes = self.brain.ring.get_nodes(
  1595. shard_range.account, shard_range.container)
  1596. shard_broker = self.get_broker(
  1597. shard_part, shard_nodes[0], shard_range.account,
  1598. shard_range.container)
  1599. shard_broker.merge_items(
  1600. [{'name': name, 'created_at': Timestamp.now().internal,
  1601. 'size': 0, 'content_type': 'text/plain',
  1602. 'etag': hashlib.md5().hexdigest(), 'deleted': deleted,
  1603. 'storage_policy_index': shard_broker.storage_policy_index}])
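# return the node holding the doctored db so that callers can run
# the sharder on precisely that server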
  1604. return shard_nodes[0]
  1605. all_obj_names = self._make_object_names(self.max_shard_size)
  1606. self.put_objects(all_obj_names)
  1607. # Shard the container
  1608. client.post_container(self.url, self.admin_token, self.container_name,
  1609. headers={'X-Container-Sharding': 'on'})
  1610. for n in self.brain.node_numbers:
  1611. self.sharders.once(
  1612. number=n, additional_args='--partitions=%s' % self.brain.part)
  1613. # sanity checks
  1614. for node in self.brain.nodes:
  1615. self.assert_container_state(node, 'sharded', 2)
  1616. self.assert_container_delete_fails()
  1617. self.assert_container_has_shard_sysmeta()
  1618. self.assert_container_post_ok('sharded')
  1619. self.assert_container_listing(all_obj_names)
1620. # delete all objects in the first shard range - updates redirected to shard
  1621. shard_ranges = self.get_container_shard_ranges()
  1622. self.assertLengthEqual(shard_ranges, 2)
  1623. shard_0_objects = [name for name in all_obj_names
  1624. if name in shard_ranges[0]]
  1625. shard_1_objects = [name for name in all_obj_names
  1626. if name in shard_ranges[1]]
  1627. self.delete_objects(shard_0_objects)
  1628. self.assert_container_listing(shard_1_objects)
  1629. self.assert_container_post_ok('has objects')
  1630. # run sharder on first shard container to update root stats
  1631. self.run_sharders(shard_ranges[0])
  1632. self.assert_container_object_count(len(shard_1_objects))
  1633. # First, test a misplaced object moving from one shard to another.
  1634. # run sharder on root to discover first shrink candidate
  1635. self.sharders.once(additional_args='--partitions=%s' % self.brain.part)
  1636. # then run sharder on first shard range to shrink it
  1637. self.run_sharders(shard_ranges[0])
  1638. # force a misplaced object into the shrunken shard range to simulate
  1639. # a client put that was in flight when it started to shrink
  1640. misplaced_node = merge_object(shard_ranges[0], 'alpha', deleted=0)
1641. # root sees the first shard has shrunk; only the second shard range
1642. # is used for listing, so the alpha object is not in the listing
  1643. self.assertLengthEqual(self.get_container_shard_ranges(), 1)
  1644. self.assert_container_listing(shard_1_objects)
  1645. self.assert_container_object_count(len(shard_1_objects))
  1646. # until sharder runs on that node to move the misplaced object to the
  1647. # second shard range
  1648. shard_part, shard_nodes_numbers = self.get_part_and_node_numbers(
  1649. shard_ranges[0])
  1650. self.sharders.once(additional_args='--partitions=%s' % shard_part,
  1651. number=misplaced_node['id'] + 1)
  1652. self.assert_container_listing(['alpha'] + shard_1_objects)
  1653. # root not yet updated
  1654. self.assert_container_object_count(len(shard_1_objects))
  1655. # run sharder to get root updated
  1656. self.run_sharders(shard_ranges[1])
  1657. self.assert_container_listing(['alpha'] + shard_1_objects)
  1658. self.assert_container_object_count(len(shard_1_objects) + 1)
  1659. self.assertLengthEqual(self.get_container_shard_ranges(), 1)
1660. # Now that we have just one active shard, test a misplaced object
1661. # moving from that shard to the root.
  1662. # delete most objects from second shard range and run sharder on root
  1663. # to discover second shrink candidate
  1664. self.delete_objects(shard_1_objects)
  1665. self.run_sharders(shard_ranges[1])
  1666. self.sharders.once(additional_args='--partitions=%s' % self.brain.part)
  1667. # then run sharder on the shard node to shrink it to root - note this
  1668. # moves alpha to the root db
  1669. self.run_sharders(shard_ranges[1])
  1670. # now there are no active shards
  1671. self.assertFalse(self.get_container_shard_ranges())
  1672. # force some misplaced object updates into second shrunk shard range
  1673. merge_object(shard_ranges[1], 'alpha', deleted=1)
  1674. misplaced_node = merge_object(shard_ranges[1], 'beta', deleted=0)
  1675. # root is not yet aware of them
  1676. self.assert_container_listing(['alpha'])
  1677. self.assert_container_object_count(1)
  1678. # until sharder runs on that node to move the misplaced object
  1679. shard_part, shard_nodes_numbers = self.get_part_and_node_numbers(
  1680. shard_ranges[1])
  1681. self.sharders.once(additional_args='--partitions=%s' % shard_part,
  1682. number=misplaced_node['id'] + 1)
  1683. self.assert_container_listing(['beta'])
  1684. self.assert_container_object_count(1)
  1685. self.assert_container_delete_fails()
  1686. def test_replication_to_sharded_container_from_unsharded_old_primary(self):
  1687. primary_ids = [n['id'] for n in self.brain.nodes]
  1688. handoff_node = next(n for n in self.brain.ring.devs
  1689. if n['id'] not in primary_ids)
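# (any device that is not a primary for this container's partition
# can play the part of a handoff node)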
  1690. # start with two sharded replicas and one unsharded with extra object
  1691. obj_names = self._setup_replication_scenario(2)
  1692. for node in self.brain.nodes[:2]:
  1693. self.assert_container_state(node, 'sharded', 2)
1694. # Fake a ring change - copy the unsharded db (no shard ranges) to a
1695. # handoff to create the illusion of a new unpopulated primary node
  1696. node_numbers = self.brain.node_numbers
  1697. new_primary_node = self.brain.nodes[2]
  1698. new_primary_node_number = node_numbers[2]
  1699. new_primary_dir, container_hash = self.get_storage_dir(
  1700. self.brain.part, new_primary_node)
  1701. old_primary_dir, container_hash = self.get_storage_dir(
  1702. self.brain.part, handoff_node)
  1703. utils.mkdirs(os.path.dirname(old_primary_dir))
  1704. shutil.move(new_primary_dir, old_primary_dir)
  1705. # make the cluster more or less "healthy" again
  1706. self.brain.servers.start(number=new_primary_node_number)
  1707. # get a db on every node...
  1708. client.put_container(self.url, self.token, self.container_name)
  1709. self.assertTrue(os.path.exists(os.path.join(
  1710. new_primary_dir, container_hash + '.db')))
  1711. found = self.categorize_container_dir_content()
  1712. self.assertLengthEqual(found['normal_dbs'], 1) # "new" primary
  1713. self.assertLengthEqual(found['shard_dbs'], 2) # existing primaries
  1714. # catastrophic failure! drive dies and is replaced on unchanged primary
  1715. failed_node = self.brain.nodes[0]
  1716. failed_dir, _container_hash = self.get_storage_dir(
  1717. self.brain.part, failed_node)
  1718. shutil.rmtree(failed_dir)
  1719. # replicate the "old primary" to everybody except the "new primary"
  1720. self.brain.servers.stop(number=new_primary_node_number)
  1721. self.replicators.once(number=handoff_node['id'] + 1)
  1722. # We're willing to rsync the retiring db to the failed primary.
  1723. # This may or may not have shard ranges, depending on the order in
  1724. # which we hit the primaries, but it definitely *doesn't* have an
  1725. # epoch in its name yet. All objects are replicated.
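# (whole-db rsync is expected here because the failed primary has no
# db at all; row-by-row usync only applies when both ends already
# have a db)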
  1726. self.assertTrue(os.path.exists(os.path.join(
  1727. failed_dir, container_hash + '.db')))
  1728. self.assertLengthEqual(os.listdir(failed_dir), 1)
  1729. broker = self.get_broker(self.brain.part, failed_node)
  1730. self.assertLengthEqual(broker.get_objects(), len(obj_names) + 1)
  1731. # The other out-of-date primary is within usync range but objects are
  1732. # not replicated to it because the handoff db learns about shard ranges
  1733. broker = self.get_broker(self.brain.part, self.brain.nodes[1])
  1734. self.assertLengthEqual(broker.get_objects(), 0)
  1735. # Handoff db still exists and now has shard ranges!
  1736. self.assertTrue(os.path.exists(os.path.join(
  1737. old_primary_dir, container_hash + '.db')))
  1738. broker = self.get_broker(self.brain.part, handoff_node)
  1739. shard_ranges = broker.get_shard_ranges()
  1740. self.assertLengthEqual(shard_ranges, 2)
  1741. self.assert_container_state(handoff_node, 'unsharded', 2)
  1742. # Replicate again, this time *including* "new primary"
  1743. self.brain.servers.start(number=new_primary_node_number)
  1744. self.replicators.once(number=handoff_node['id'] + 1)
  1745. # Ordinarily, we would have rsync_then_merge'd to "new primary"
  1746. # but instead we wait
  1747. broker = self.get_broker(self.brain.part, new_primary_node)
  1748. self.assertLengthEqual(broker.get_objects(), 0)
  1749. shard_ranges = broker.get_shard_ranges()
  1750. self.assertLengthEqual(shard_ranges, 2)
  1751. # so the next time the sharder comes along, it can push rows out
  1752. # and delete the big db
  1753. self.sharders.once(number=handoff_node['id'] + 1,
  1754. additional_args='--partitions=%s' % self.brain.part)
  1755. self.assert_container_state(handoff_node, 'sharded', 2)
  1756. self.assertFalse(os.path.exists(os.path.join(
  1757. old_primary_dir, container_hash + '.db')))
  1758. # the sharded db hangs around until replication confirms durability
  1759. # first attempt is not sufficiently successful
  1760. self.brain.servers.stop(number=node_numbers[0])
  1761. self.replicators.once(number=handoff_node['id'] + 1)
  1762. self.assertTrue(os.path.exists(old_primary_dir))
  1763. self.assert_container_state(handoff_node, 'sharded', 2)
  1764. # second attempt is successful and handoff db is deleted
  1765. self.brain.servers.start(number=node_numbers[0])
  1766. self.replicators.once(number=handoff_node['id'] + 1)
  1767. self.assertFalse(os.path.exists(old_primary_dir))
  1768. # run all the sharders, get us into a consistent state
  1769. self.sharders.once(additional_args='--partitions=%s' % self.brain.part)
  1770. self.assert_container_listing(['alpha'] + obj_names)
  1771. def test_replication_to_empty_new_primary_from_sharding_old_primary(self):
  1772. primary_ids = [n['id'] for n in self.brain.nodes]
  1773. handoff_node = next(n for n in self.brain.ring.devs
  1774. if n['id'] not in primary_ids)
  1775. num_shards = 3
  1776. obj_names = self._make_object_names(
  1777. num_shards * self.max_shard_size // 2)
  1778. self.put_objects(obj_names)
  1779. client.post_container(self.url, self.admin_token, self.container_name,
  1780. headers={'X-Container-Sharding': 'on'})
  1781. # run replicators first time to get sync points set
  1782. self.replicators.once()
  1783. # start sharding on only the leader node
  1784. leader_node = self.brain.nodes[0]
  1785. leader_node_number = self.brain.node_numbers[0]
  1786. self.sharders.once(number=leader_node_number)
  1787. self.assert_container_state(leader_node, 'sharding', 3)
  1788. for node in self.brain.nodes[1:]:
  1789. self.assert_container_state(node, 'unsharded', 3)
1790. # Fake a ring change - copy the leader node db to a handoff to create
1791. # the illusion of a new unpopulated primary leader node
  1792. new_primary_dir, container_hash = self.get_storage_dir(
  1793. self.brain.part, leader_node)
  1794. old_primary_dir, container_hash = self.get_storage_dir(
  1795. self.brain.part, handoff_node)
  1796. utils.mkdirs(os.path.dirname(old_primary_dir))
  1797. shutil.move(new_primary_dir, old_primary_dir)
  1798. self.assert_container_state(handoff_node, 'sharding', 3)
  1799. # run replicator on handoff node to create a fresh db on new primary
  1800. self.assertFalse(os.path.exists(new_primary_dir))
  1801. self.replicators.once(number=handoff_node['id'] + 1)
  1802. self.assertTrue(os.path.exists(new_primary_dir))
  1803. self.assert_container_state(leader_node, 'sharded', 3)
  1804. broker = self.get_broker(self.brain.part, leader_node)
  1805. shard_ranges = broker.get_shard_ranges()
  1806. self.assertLengthEqual(shard_ranges, 3)
  1807. self.assertEqual(
  1808. [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED],
  1809. [sr.state for sr in shard_ranges])
  1810. # db still exists on handoff
  1811. self.assertTrue(os.path.exists(old_primary_dir))
  1812. self.assert_container_state(handoff_node, 'sharding', 3)
  1813. # continue sharding it...
  1814. self.sharders.once(number=handoff_node['id'] + 1)
  1815. self.assert_container_state(leader_node, 'sharded', 3)
1816. # now that the handoff is fully sharded, the replicator will delete it
  1817. self.replicators.once(number=handoff_node['id'] + 1)
  1818. self.assertFalse(os.path.exists(old_primary_dir))
  1819. # all primaries now have active shard ranges but only one is in sharded
  1820. # state
  1821. self.assert_container_state(leader_node, 'sharded', 3)
  1822. for node in self.brain.nodes[1:]:
  1823. self.assert_container_state(node, 'unsharded', 3)
  1824. node_data = self.direct_get_container_shard_ranges()
  1825. for node_id, (hdrs, shard_ranges) in node_data.items():
  1826. with annotate_failure(
1827. 'node id %s from %s' % (node_id, node_data.keys())):
  1828. self.assert_shard_range_state(ShardRange.ACTIVE, shard_ranges)
1829. # check the handoff cleaved all objects before it was deleted - stop
1830. # all but the leader node so that listing is fetched from shards
  1831. for number in self.brain.node_numbers[1:3]:
  1832. self.brain.servers.stop(number=number)
  1833. self.assert_container_listing(obj_names)
  1834. for number in self.brain.node_numbers[1:3]:
  1835. self.brain.servers.start(number=number)
  1836. self.sharders.once()
  1837. self.assert_container_state(leader_node, 'sharded', 3)
  1838. for node in self.brain.nodes[1:]:
  1839. self.assert_container_state(node, 'sharding', 3)
  1840. self.sharders.once()
  1841. for node in self.brain.nodes:
  1842. self.assert_container_state(node, 'sharded', 3)
  1843. self.assert_container_listing(obj_names)
  1844. def test_sharded_account_updates(self):
  1845. # verify that .shards account updates have zero object count and bytes
  1846. # to avoid double accounting
  1847. all_obj_names = self._make_object_names(self.max_shard_size)
  1848. self.put_objects(all_obj_names, contents='xyz')
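# ('xyz' makes each object 3 bytes, so the expected account
# bytes-used is 3 * object count)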
  1849. # Shard the container into 2 shards
  1850. client.post_container(self.url, self.admin_token, self.container_name,
  1851. headers={'X-Container-Sharding': 'on'})
  1852. for n in self.brain.node_numbers:
  1853. self.sharders.once(
  1854. number=n, additional_args='--partitions=%s' % self.brain.part)
  1855. # sanity checks
  1856. for node in self.brain.nodes:
  1857. shard_ranges = self.assert_container_state(node, 'sharded', 2)
  1858. self.assert_container_delete_fails()
  1859. self.assert_container_has_shard_sysmeta()
  1860. self.assert_container_post_ok('sharded')
  1861. self.assert_container_listing(all_obj_names)
  1862. # run the updaters to get account stats updated
  1863. self.updaters.once()
  1864. # check user account stats
  1865. metadata = self.internal_client.get_account_metadata(self.account)
  1866. self.assertEqual(1, int(metadata.get('x-account-container-count')))
  1867. self.assertEqual(self.max_shard_size,
  1868. int(metadata.get('x-account-object-count')))
  1869. self.assertEqual(3 * self.max_shard_size,
  1870. int(metadata.get('x-account-bytes-used')))
  1871. # check hidden .shards account stats
  1872. metadata = self.internal_client.get_account_metadata(
  1873. shard_ranges[0].account)
  1874. self.assertEqual(2, int(metadata.get('x-account-container-count')))
  1875. self.assertEqual(0, int(metadata.get('x-account-object-count')))
  1876. self.assertEqual(0, int(metadata.get('x-account-bytes-used')))