Reject object names with Unicode surrogates

Technically, surrogate code points (U+D800-U+DFFF) are not valid
Unicode scalar values, so well-formed UTF-8 cannot encode them at all.
Python 2's UTF-8 codec accepts them anyway; Python 3's does not.

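We already have a check for surrogate pairs (commit 0080337), but not
one for lone surrogates (a high or low surrogate with no partner).
This commit forbids object names with lone surrogates in them, so that
the same names are rejected regardless of the Python version.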

The problem with surrogates is trivially reproducible:

    swift@saio:~$ python2.7
    Python 2.7.3 (default, Feb 27 2014, 19:58:35)
    [GCC 4.6.3] on linux2
    Type "help", "copyright", "credits" or "license" for more information.
    >>> b'\xed\xa0\xbc'.decode('utf-8')
    u'\ud83c'
    >>>

    swift@saio:~$ python3.3
    Python 3.3.5 (default, Aug  4 2014, 15:27:24)
    [GCC 4.6.3] on linux
    Type "help", "copyright", "credits" or "license" for more information.
    >>> b'\xed\xa0\xbc'.decode('utf-8')
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0: invalid continuation byte
    >>>

See also http://bugs.python.org/issue9133

Change-Id: I7c31022e8a028c3cdf2ed1586349509d96cfded9
This commit is contained in:
Samuel Merritt 2014-11-07 13:53:46 -08:00
parent 772dc5d059
commit 331b14238e
2 changed files with 10 additions and 1 deletions

View File

@ -306,7 +306,12 @@ def check_utf8(string):
if isinstance(string, unicode):
string.encode('utf-8')
else:
if string.decode('UTF-8').encode('UTF-8') != string:
decoded = string.decode('UTF-8')
if decoded.encode('UTF-8') != string:
return False
# A UTF-8 string with surrogates in it is invalid.
if any(0xD800 <= ord(codepoint) <= 0xDFFF
for codepoint in decoded):
return False
return '\x00' not in string
# If string is unicode, decode() will raise UnicodeEncodeError

View File

@ -420,6 +420,10 @@ class TestConstraints(unittest.TestCase):
self.assertFalse(constraints.check_utf8('\xed\xa0\xbc\xed\xbc\xb8'))
self.assertFalse(constraints.check_utf8('\xed\xa0\xbd\xed\xb9\x88'))
def test_check_utf8_lone_surrogates(self):
self.assertFalse(constraints.check_utf8('\xed\xa0\xbc'))
self.assertFalse(constraints.check_utf8('\xed\xb9\x88'))
def test_validate_bad_meta(self):
req = Request.blank(
'/v/a/c/o',