Merge "Fix OIDC federation UTF-8 double-encoding of non-ASCII characters"

This commit is contained in:
Zuul
2026-03-31 09:47:37 +00:00
committed by Gerrit Code Review
3 changed files with 112 additions and 5 deletions

View File

@@ -444,11 +444,25 @@ def get_assertion_params_from_env():
for k, v in list(flask.request.environ.items()):
if not k.startswith(prefix):
continue
# These bytes may be decodable as ISO-8859-1 according to Section
# 3.2.4 of RFC 7230. Let's assume that our web server plugins are
# correctly encoding the data.
if not isinstance(v, str) and getattr(v, 'decode', False):
v = v.decode('ISO-8859-1')
if isinstance(v, str):
# Per Section 3.2.4 of RFC 7230, HTTP header field values use
# ISO-8859-1 encoding, and PEP 3333 requires WSGI environ
# values to be native strings decoded as Latin-1 accordingly.
# However, OIDC IdPs commonly send assertion values encoded as
# UTF-8 (e.g. non-ASCII characters like 'ñ' or 'å'). When
# mod_wsgi decodes those UTF-8 bytes as Latin-1, the result is
# mojibake. We reverse the Latin-1 decode and re-decode as
# UTF-8 to recover the original text. If either step fails (the value
# has characters outside Latin-1, or its bytes are not valid UTF-8),
# it is kept as-is.
try:
v = v.encode('ISO-8859-1').decode('utf-8')
except (UnicodeDecodeError, UnicodeEncodeError):
pass
elif getattr(v, 'decode', False):
try:
v = v.decode('utf-8')
except UnicodeDecodeError:
v = v.decode('ISO-8859-1')
yield (k, v)

View File

@@ -997,6 +997,77 @@ class TestUnicodeAssertionData(unit.BaseTestCase):
self.assertEqual(full_name, user_name)
class TestWsgiUtf8LatinRoundtrip(unit.BaseTestCase):
    """Check recovery of UTF-8 text that WSGI handed over as Latin-1.

    Under PEP 3333, WSGI environ values are native strings decoded as
    Latin-1. If an IdP emits non-ASCII characters as UTF-8 (for instance
    Spanish 'ñ' or Scandinavian 'å'), that Latin-1 decoding turns them
    into mojibake. These tests exercise get_assertion_params_from_env(),
    which is expected to undo the damage by encoding back to Latin-1 and
    decoding the bytes as UTF-8.
    """

    def setUp(self):
        super().setUp()
        self.config_fixture = self.useFixture(config_fixture.Config(CONF))
        self.config_fixture.config(group='federation', assertion_prefix='PFX')

    def _get_assertion_via_wsgi(self, environ_overrides):
        """Return the assertion params seen inside a fake Flask request.

        ``environ_overrides`` is passed straight to the test request
        context, so its entries show up in ``flask.request.environ``.
        """
        request_ctx = flask.Flask(__name__).test_request_context(
            path='/path', environ_overrides=environ_overrides
        )
        with request_ctx:
            # Consume the generator while the request context is active.
            return dict(mapping_utils.get_assertion_params_from_env())

    def test_utf8_latin1_roundtrip_recovers_unicode(self):
        """Mojibake name values come back as the intended Unicode text."""
        params = self._get_assertion_via_wsgi(
            mapping_fixtures.WSGI_LATIN1_UTF8_ASSERTION
        )
        wanted = (('PFX_FirstName', 'Jon Kåre'), ('PFX_LastName', 'Hellån'))
        for key, expected in wanted:
            self.assertEqual(expected, params[key])

    def test_already_correct_unicode_is_preserved(self):
        """Values that are already proper Unicode must stay untouched."""
        params = self._get_assertion_via_wsgi(
            mapping_fixtures.UNICODE_NAME_ASSERTION
        )
        wanted = (('PFX_FirstName', 'Jon Kåre'), ('PFX_LastName', 'Hellån'))
        for key, expected in wanted:
            self.assertEqual(expected, params[key])

    def test_ascii_values_are_unaffected(self):
        """Plain ASCII values are returned exactly as supplied."""
        params = self._get_assertion_via_wsgi(
            mapping_fixtures.UNICODE_NAME_ASSERTION
        )
        wanted = (('PFX_Email', 'jon@example.com'), ('PFX_UserName', 'jonkare'))
        for key, expected in wanted:
            self.assertEqual(expected, params[key])

    def test_oidc_groups_with_special_chars(self):
        """An OIDC groups value containing 'ñ' is decoded correctly."""
        self.config_fixture.config(group='federation', assertion_prefix='OIDC')
        groups = self._get_assertion_via_wsgi(
            mapping_fixtures.WSGI_LATIN1_UTF8_GROUPS_ASSERTION
        )['OIDC-groups']
        self.assertIn('España', groups)
        # U+00C3 is the leading character of the mojibake form of 'ñ'.
        self.assertNotIn('\u00c3', groups)

    def test_bytes_value_utf8(self):
        """A bytes value holding UTF-8 is decoded as UTF-8."""
        raw = 'Espa\u00f1a'.encode()
        params = self._get_assertion_via_wsgi({'PFX_Name': raw})
        self.assertEqual('España', params['PFX_Name'])

    def test_bytes_value_latin1_fallback(self):
        """A bytes value that is not valid UTF-8 is decoded as ISO-8859-1."""
        raw = 'Espa\u00f1a'.encode('ISO-8859-1')
        params = self._get_assertion_via_wsgi({'PFX_Name': raw})
        self.assertEqual('España', params['PFX_Name'])
class TestMappingLocals(unit.BaseTestCase):
mapping_split = {
'rules': [

View File

@@ -1045,6 +1045,28 @@ UNICODE_NAME_ASSERTION = {
'PFX_orgPersonType': 'Admin;Chief',
}
# Simulates what mod_wsgi does to UTF-8 data per PEP 3333: the raw UTF-8
# bytes are decoded as Latin-1, producing mojibake. For example, 'ñ' (UTF-8:
# \xc3\xb1) becomes 'Ã±' (Latin-1 interpretation of those two bytes).
WSGI_LATIN1_UTF8_ASSERTION = {
    'PFX_Email': 'jon@example.com',
    'PFX_UserName': 'jonkare',
    # The names below are built the way the mojibake arises: take the
    # intended text, encode it as UTF-8, then read those bytes as Latin-1.
    'PFX_FirstName': 'Jon Kåre'.encode('utf-8').decode('ISO-8859-1'),
    'PFX_LastName': 'Hellån'.encode('utf-8').decode('ISO-8859-1'),
    'PFX_orgPersonType': 'Admin;Chief',
}
# Simulates OIDC groups assertion with non-ASCII characters (e.g. Spanish ñ)
# arriving through WSGI with Latin-1 decoding of UTF-8 bytes.
WSGI_LATIN1_UTF8_GROUPS_ASSERTION = {
    'OIDC-upn': 'user@example.com',
    # Semicolon-separated group names. The whole value is encoded as UTF-8
    # and then read back as Latin-1, so 'ñ' turns into its two-character
    # mojibake form while the ASCII parts are unchanged.
    'OIDC-groups': ';'.join(
        ('Team_España_1401_power_user', 'federation-tests_power_user')
    )
    .encode('utf-8')
    .decode('ISO-8859-1'),
}
GROUPS_ASSERTION_ONLY_ONE_GROUP = {
'userEmail': 'jill@example.com',
'UserName': 'jsmith',