Deterministic, repeatable serialization for rings.

The (account|container|object).ring.gz files contain, among other
things, a JSON-encoded dictionary. This change simply makes the JSON
serializer sort the keys of that dictionary so that two
Python-identical rings will result in two bytewise-identical ring
files. Also, to get repeatable compression, we lock down the timestamp
in the gzip output stream to a fixed value. (There's a timestamp value
in a gzip stream header; by default, gzip.GzipFile sticks time.time()
in there.)

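The fixed timestamp only takes effect on Python 2.7 and later; on 2.6,
the 'mtime' argument to gzip.GzipFile() is unsupported, so we fall back
to the default behavior there. Don't worry, serialization still works
on 2.6 (and the JSON keys are still sorted). It just doesn't always
produce the same bytes for the same ring.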

Change-Id: Ide446413d0aeb78536883933fd0caf440b8f54ad
This commit is contained in:
Samuel Merritt 2013-01-31 15:12:09 -08:00
parent 85529531d6
commit 156cdc8edf
2 changed files with 36 additions and 2 deletions
swift/common/ring
test/unit/common/ring

@ -85,7 +85,8 @@ class RingData(object):
# Write out new-style serialization magic and version:
file_obj.write(struct.pack('!4sH', 'R1NG', 1))
ring = self.to_dict()
json_text = json.dumps(
json_encoder = json.JSONEncoder(sort_keys=True)
json_text = json_encoder.encode(
{'devs': ring['devs'], 'part_shift': ring['part_shift'],
'replica_count': len(ring['replica2part2dev_id'])})
json_len = len(json_text)
@ -100,7 +101,16 @@ class RingData(object):
:param filename: File into which this instance should be serialized.
"""
gz_file = GzipFile(filename, 'wb')
# Override the timestamp so that the same ring data creates
# the same bytes on disk. This makes a checksum comparison a
# good way to see if two rings are identical.
#
# This only works on Python 2.7; on 2.6, we always get the
# current time in the gzip output.
try:
gz_file = GzipFile(filename, 'wb', mtime=1300507380.0)
except TypeError:
gz_file = GzipFile(filename, 'wb')
self.serialize_v1(gz_file)
gz_file.close()

@ -16,6 +16,7 @@
import array
import cPickle as pickle
import os
import sys
import unittest
from gzip import GzipFile
from shutil import rmtree
@ -67,6 +68,29 @@ class TestRingData(unittest.TestCase):
rd2 = ring.RingData.load(ring_fname)
self.assert_ring_data_equal(rd, rd2)
def test_deterministic_serialization(self):
    """
    Two identical rings should produce identical .gz files on disk.

    Only true on Python 2.7 or greater; on older versions the test
    returns immediately without asserting anything.
    """
    if sys.version_info < (2, 7):
        return
    os.mkdir(os.path.join(self.testdir, '1'))
    os.mkdir(os.path.join(self.testdir, '2'))
    # These have to have the same filename (not full path,
    # obviously) since the filename gets encoded in the gzip data.
    ring_fname1 = os.path.join(self.testdir, '1', 'the.ring.gz')
    ring_fname2 = os.path.join(self.testdir, '2', 'the.ring.gz')
    rd = ring.RingData(
        [array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
        [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
    rd.save(ring_fname1)
    rd.save(ring_fname2)
    # Compare the raw bytes: the files are gzip streams, so they must be
    # opened in binary mode to avoid newline translation or decoding.
    # The "with" statements stay nested (rather than combined into one)
    # so this module still parses on Python 2.6.
    with open(ring_fname1, 'rb') as ring1:
        with open(ring_fname2, 'rb') as ring2:
            self.assertEqual(ring1.read(), ring2.read())
class TestRing(unittest.TestCase):