| From 0250d40aa3e7f471bc7cdecccd956e378eb65eb0 Mon Sep 17 00:00:00 2001 |
| From: Victor Stinner <vstinner@python.org> |
| Date: Fri, 15 Dec 2023 16:10:40 +0100 |
| Subject: [PATCH] [3.8] [CVE-2023-27043] gh-102988: Reject malformed addresses |
| in email.parseaddr() (GH-111116) |
| |
| Detect email address parsing errors and return empty tuple to |
| indicate the parsing error (old API). Add an optional 'strict' |
| parameter to getaddresses() and parseaddr() functions. Patch by |
| Thomas Dwyer. |
| |
| (cherry picked from commit 4a153a1d3b18803a684cd1bcc2cdf3ede3dbae19) |
| |
| Co-authored-by: Victor Stinner <vstinner@python.org> |
| Co-Authored-By: Thomas Dwyer <github@tomd.tel> |
| --- |
| Doc/library/email.utils.rst | 19 +- |
| Lib/email/utils.py | 151 ++++++++++++- |
| Lib/test/test_email/test_email.py | 204 +++++++++++++++++- |
| ...-10-20-15-28-08.gh-issue-102988.dStNO7.rst | 8 + |
| 4 files changed, 361 insertions(+), 21 deletions(-) |
| create mode 100644 Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst |
| |
| diff --git a/Doc/library/email.utils.rst b/Doc/library/email.utils.rst |
| index 4d0e920eb0ad29..104229e9e55af2 100644 |
| --- a/Doc/library/email.utils.rst |
| +++ b/Doc/library/email.utils.rst |
| @@ -60,13 +60,18 @@ of the new API. |
| begins with angle brackets, they are stripped off. |
| |
| |
| -.. function:: parseaddr(address) |
| +.. function:: parseaddr(address, *, strict=True) |
| |
| Parse address -- which should be the value of some address-containing field such |
| as :mailheader:`To` or :mailheader:`Cc` -- into its constituent *realname* and |
| *email address* parts. Returns a tuple of that information, unless the parse |
| fails, in which case a 2-tuple of ``('', '')`` is returned. |
| |
| + If *strict* is true, use a strict parser which rejects malformed inputs. |
| + |
| + .. versionchanged:: 3.8.19 |
| + Add *strict* optional parameter and reject malformed inputs by default. |
| + |
| |
| .. function:: formataddr(pair, charset='utf-8') |
| |
| @@ -84,12 +89,15 @@ of the new API. |
| Added the *charset* option. |
| |
| |
| -.. function:: getaddresses(fieldvalues) |
| +.. function:: getaddresses(fieldvalues, *, strict=True) |
| |
| This method returns a list of 2-tuples of the form returned by ``parseaddr()``. |
| *fieldvalues* is a sequence of header field values as might be returned by |
| - :meth:`Message.get_all <email.message.Message.get_all>`. Here's a simple |
| - example that gets all the recipients of a message:: |
| + :meth:`Message.get_all <email.message.Message.get_all>`. |
| + |
| + If *strict* is true, use a strict parser which rejects malformed inputs. |
| + |
| + Here's a simple example that gets all the recipients of a message:: |
| |
| from email.utils import getaddresses |
| |
| @@ -99,6 +107,9 @@ of the new API. |
| resent_ccs = msg.get_all('resent-cc', []) |
| all_recipients = getaddresses(tos + ccs + resent_tos + resent_ccs) |
| |
| + .. versionchanged:: 3.8.19 |
| + Add *strict* optional parameter and reject malformed inputs by default. |
| + |
| |
| .. function:: parsedate(date) |
| |
| diff --git a/Lib/email/utils.py b/Lib/email/utils.py |
| index 07dd029cc02800..99773d0b930ad3 100644 |
| --- a/Lib/email/utils.py |
| +++ b/Lib/email/utils.py |
| @@ -48,6 +48,7 @@ |
| specialsre = re.compile(r'[][\\()<>@,:;".]') |
| escapesre = re.compile(r'[\\"]') |
| |
| + |
| def _has_surrogates(s): |
| """Return True if s contains surrogate-escaped binary data.""" |
| # This check is based on the fact that unless there are surrogates, utf8 |
| @@ -106,12 +107,127 @@ def formataddr(pair, charset='utf-8'): |
| return address |
| |
| |
| +def _iter_escaped_chars(addr): |
| + pos = 0 |
| + escape = False |
| + for pos, ch in enumerate(addr): |
| + if escape: |
| + yield (pos, '\\' + ch) |
| + escape = False |
| + elif ch == '\\': |
| + escape = True |
| + else: |
| + yield (pos, ch) |
| + if escape: |
| + yield (pos, '\\') |
| + |
| + |
| +def _strip_quoted_realnames(addr): |
| + """Strip real names between quotes.""" |
| + if '"' not in addr: |
| + # Fast path |
| + return addr |
| + |
| + start = 0 |
| + open_pos = None |
| + result = [] |
| + for pos, ch in _iter_escaped_chars(addr): |
| + if ch == '"': |
| + if open_pos is None: |
| + open_pos = pos |
| + else: |
| + if start != open_pos: |
| + result.append(addr[start:open_pos]) |
| + start = pos + 1 |
| + open_pos = None |
| + |
| + if start < len(addr): |
| + result.append(addr[start:]) |
| + |
| + return ''.join(result) |
| |
| -def getaddresses(fieldvalues): |
| - """Return a list of (REALNAME, EMAIL) for each fieldvalue.""" |
| - all = COMMASPACE.join(fieldvalues) |
| - a = _AddressList(all) |
| - return a.addresslist |
| + |
| +supports_strict_parsing = True |
| + |
| +def getaddresses(fieldvalues, *, strict=True): |
| + """Return a list of (REALNAME, EMAIL) or ('','') for each fieldvalue. |
| + |
| + When parsing fails for a fieldvalue, a 2-tuple of ('', '') is returned in |
| + its place. |
| + |
| + If strict is true, use a strict parser which rejects malformed inputs. |
| + """ |
| + |
| + # If strict is true, if the resulting list of parsed addresses is greater |
| + # than the number of fieldvalues in the input list, a parsing error has |
| + # occurred and consequently a list containing a single empty 2-tuple [('', |
| + # '')] is returned in its place. This is done to avoid invalid output. |
| + # |
| + # Malformed input: getaddresses(['alice@example.com <bob@example.com>']) |
| + # Invalid output: [('', 'alice@example.com'), ('', 'bob@example.com')] |
| + # Safe output: [('', '')] |
| + |
| + if not strict: |
| + all = COMMASPACE.join(str(v) for v in fieldvalues) |
| + a = _AddressList(all) |
| + return a.addresslist |
| + |
| + fieldvalues = [str(v) for v in fieldvalues] |
| + fieldvalues = _pre_parse_validation(fieldvalues) |
| + addr = COMMASPACE.join(fieldvalues) |
| + a = _AddressList(addr) |
| + result = _post_parse_validation(a.addresslist) |
| + |
| + # Treat output as invalid if the number of addresses is not equal to the |
| + # expected number of addresses. |
| + n = 0 |
| + for v in fieldvalues: |
| + # When a comma is used in the Real Name part it is not a delimiter. |
| + # So strip those out before counting the commas. |
| + v = _strip_quoted_realnames(v) |
| + # Expected number of addresses: 1 + number of commas |
| + n += 1 + v.count(',') |
| + if len(result) != n: |
| + return [('', '')] |
| + |
| + return result |
| + |
| + |
| +def _check_parenthesis(addr): |
| + # Ignore parenthesis in quoted real names. |
| + addr = _strip_quoted_realnames(addr) |
| + |
| + opens = 0 |
| + for pos, ch in _iter_escaped_chars(addr): |
| + if ch == '(': |
| + opens += 1 |
| + elif ch == ')': |
| + opens -= 1 |
| + if opens < 0: |
| + return False |
| + return (opens == 0) |
| + |
| + |
| +def _pre_parse_validation(email_header_fields): |
| + accepted_values = [] |
| + for v in email_header_fields: |
| + if not _check_parenthesis(v): |
| + v = "('', '')" |
| + accepted_values.append(v) |
| + |
| + return accepted_values |
| + |
| + |
| +def _post_parse_validation(parsed_email_header_tuples): |
| + accepted_values = [] |
| + # The parser would have parsed a correctly formatted domain-literal |
| + # The existence of an [ after parsing indicates a parsing failure |
| + for v in parsed_email_header_tuples: |
| + if '[' in v[1]: |
| + v = ('', '') |
| + accepted_values.append(v) |
| + |
| + return accepted_values |
| |
| |
| def _format_timetuple_and_zone(timetuple, zone): |
| @@ -202,16 +318,33 @@ def parsedate_to_datetime(data): |
| tzinfo=datetime.timezone(datetime.timedelta(seconds=tz))) |
| |
| |
| -def parseaddr(addr): |
| +def parseaddr(addr, *, strict=True): |
| """ |
| Parse addr into its constituent realname and email address parts. |
| |
| Return a tuple of realname and email address, unless the parse fails, in |
| which case return a 2-tuple of ('', ''). |
| + |
| + If strict is True, use a strict parser which rejects malformed inputs. |
| """ |
| - addrs = _AddressList(addr).addresslist |
| - if not addrs: |
| - return '', '' |
| + if not strict: |
| + addrs = _AddressList(addr).addresslist |
| + if not addrs: |
| + return ('', '') |
| + return addrs[0] |
| + |
| + if isinstance(addr, list): |
| + addr = addr[0] |
| + |
| + if not isinstance(addr, str): |
| + return ('', '') |
| + |
| + addr = _pre_parse_validation([addr])[0] |
| + addrs = _post_parse_validation(_AddressList(addr).addresslist) |
| + |
| + if not addrs or len(addrs) > 1: |
| + return ('', '') |
| + |
| return addrs[0] |
| |
| |
| diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py |
| index d68de61bb696b9..ee9a9645f55421 100644 |
| --- a/Lib/test/test_email/test_email.py |
| +++ b/Lib/test/test_email/test_email.py |
| @@ -16,6 +16,7 @@ |
| |
| import email |
| import email.policy |
| +import email.utils |
| |
| from email.charset import Charset |
| from email.header import Header, decode_header, make_header |
| @@ -3248,15 +3249,154 @@ def test_getaddresses(self): |
| [('Al Person', 'aperson@dom.ain'), |
| ('Bud Person', 'bperson@dom.ain')]) |
| |
| + def test_getaddresses_comma_in_name(self): |
| + """GH-106669 regression test.""" |
| + self.assertEqual( |
| + utils.getaddresses( |
| + [ |
| + '"Bud, Person" <bperson@dom.ain>', |
| + 'aperson@dom.ain (Al Person)', |
| + '"Mariusz Felisiak" <to@example.com>', |
| + ] |
| + ), |
| + [ |
| + ('Bud, Person', 'bperson@dom.ain'), |
| + ('Al Person', 'aperson@dom.ain'), |
| + ('Mariusz Felisiak', 'to@example.com'), |
| + ], |
| + ) |
| + |
| + def test_parsing_errors(self): |
| + """Test for parsing errors from CVE-2023-27043 and CVE-2019-16056""" |
| + alice = 'alice@example.org' |
| + bob = 'bob@example.com' |
| + empty = ('', '') |
| + |
| + # Test utils.getaddresses() and utils.parseaddr() on malformed email |
| + # addresses: default behavior (strict=True) rejects malformed address, |
| + # and strict=False which tolerates malformed address. |
| + for invalid_separator, expected_non_strict in ( |
| + ('(', [(f'<{bob}>', alice)]), |
| + (')', [('', alice), empty, ('', bob)]), |
| + ('<', [('', alice), empty, ('', bob), empty]), |
| + ('>', [('', alice), empty, ('', bob)]), |
| + ('[', [('', f'{alice}[<{bob}>]')]), |
| + (']', [('', alice), empty, ('', bob)]), |
| + ('@', [empty, empty, ('', bob)]), |
| + (';', [('', alice), empty, ('', bob)]), |
| + (':', [('', alice), ('', bob)]), |
| + ('.', [('', alice + '.'), ('', bob)]), |
| + ('"', [('', alice), ('', f'<{bob}>')]), |
| + ): |
| + address = f'{alice}{invalid_separator}<{bob}>' |
| + with self.subTest(address=address): |
| + self.assertEqual(utils.getaddresses([address]), |
| + [empty]) |
| + self.assertEqual(utils.getaddresses([address], strict=False), |
| + expected_non_strict) |
| + |
| + self.assertEqual(utils.parseaddr([address]), |
| + empty) |
| + self.assertEqual(utils.parseaddr([address], strict=False), |
| + ('', address)) |
| + |
| + # Comma (',') is treated differently depending on strict parameter. |
| + # Comma without quotes. |
| + address = f'{alice},<{bob}>' |
| + self.assertEqual(utils.getaddresses([address]), |
| + [('', alice), ('', bob)]) |
| + self.assertEqual(utils.getaddresses([address], strict=False), |
| + [('', alice), ('', bob)]) |
| + self.assertEqual(utils.parseaddr([address]), |
| + empty) |
| + self.assertEqual(utils.parseaddr([address], strict=False), |
| + ('', address)) |
| + |
| + # Real name between quotes containing comma. |
| + address = '"Alice, alice@example.org" <bob@example.com>' |
| + expected_strict = ('Alice, alice@example.org', 'bob@example.com') |
| + self.assertEqual(utils.getaddresses([address]), [expected_strict]) |
| + self.assertEqual(utils.getaddresses([address], strict=False), [expected_strict]) |
| + self.assertEqual(utils.parseaddr([address]), expected_strict) |
| + self.assertEqual(utils.parseaddr([address], strict=False), |
| + ('', address)) |
| + |
| + # Valid parenthesis in comments. |
| + address = 'alice@example.org (Alice)' |
| + expected_strict = ('Alice', 'alice@example.org') |
| + self.assertEqual(utils.getaddresses([address]), [expected_strict]) |
| + self.assertEqual(utils.getaddresses([address], strict=False), [expected_strict]) |
| + self.assertEqual(utils.parseaddr([address]), expected_strict) |
| + self.assertEqual(utils.parseaddr([address], strict=False), |
| + ('', address)) |
| + |
| + # Invalid parenthesis in comments. |
| + address = 'alice@example.org )Alice(' |
| + self.assertEqual(utils.getaddresses([address]), [empty]) |
| + self.assertEqual(utils.getaddresses([address], strict=False), |
| + [('', 'alice@example.org'), ('', ''), ('', 'Alice')]) |
| + self.assertEqual(utils.parseaddr([address]), empty) |
| + self.assertEqual(utils.parseaddr([address], strict=False), |
| + ('', address)) |
| + |
| + # Two addresses with quotes separated by comma. |
| + address = '"Jane Doe" <jane@example.net>, "John Doe" <john@example.net>' |
| + self.assertEqual(utils.getaddresses([address]), |
| + [('Jane Doe', 'jane@example.net'), |
| + ('John Doe', 'john@example.net')]) |
| + self.assertEqual(utils.getaddresses([address], strict=False), |
| + [('Jane Doe', 'jane@example.net'), |
| + ('John Doe', 'john@example.net')]) |
| + self.assertEqual(utils.parseaddr([address]), empty) |
| + self.assertEqual(utils.parseaddr([address], strict=False), |
| + ('', address)) |
| + |
| + # Test email.utils.supports_strict_parsing attribute |
| + self.assertEqual(email.utils.supports_strict_parsing, True) |
| + |
| def test_getaddresses_nasty(self): |
| - eq = self.assertEqual |
| - eq(utils.getaddresses(['foo: ;']), [('', '')]) |
| - eq(utils.getaddresses( |
| - ['[]*-- =~$']), |
| - [('', ''), ('', ''), ('', '*--')]) |
| - eq(utils.getaddresses( |
| - ['foo: ;', '"Jason R. Mastaler" <jason@dom.ain>']), |
| - [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')]) |
| + for addresses, expected in ( |
| + (['"Sürname, Firstname" <to@example.com>'], |
| + [('Sürname, Firstname', 'to@example.com')]), |
| + |
| + (['foo: ;'], |
| + [('', '')]), |
| + |
| + (['foo: ;', '"Jason R. Mastaler" <jason@dom.ain>'], |
| + [('', ''), ('Jason R. Mastaler', 'jason@dom.ain')]), |
| + |
| + ([r'Pete(A nice \) chap) <pete(his account)@silly.test(his host)>'], |
| + [('Pete (A nice ) chap his account his host)', 'pete@silly.test')]), |
| + |
| + (['(Empty list)(start)Undisclosed recipients :(nobody(I know))'], |
| + [('', '')]), |
| + |
| + (['Mary <@machine.tld:mary@example.net>, , jdoe@test . example'], |
| + [('Mary', 'mary@example.net'), ('', ''), ('', 'jdoe@test.example')]), |
| + |
| + (['John Doe <jdoe@machine(comment). example>'], |
| + [('John Doe (comment)', 'jdoe@machine.example')]), |
| + |
| + (['"Mary Smith: Personal Account" <smith@home.example>'], |
| + [('Mary Smith: Personal Account', 'smith@home.example')]), |
| + |
| + (['Undisclosed recipients:;'], |
| + [('', '')]), |
| + |
| + ([r'<boss@nil.test>, "Giant; \"Big\" Box" <bob@example.net>'], |
| + [('', 'boss@nil.test'), ('Giant; "Big" Box', 'bob@example.net')]), |
| + ): |
| + with self.subTest(addresses=addresses): |
| + self.assertEqual(utils.getaddresses(addresses), |
| + expected) |
| + self.assertEqual(utils.getaddresses(addresses, strict=False), |
| + expected) |
| + |
| + addresses = ['[]*-- =~$'] |
| + self.assertEqual(utils.getaddresses(addresses), |
| + [('', '')]) |
| + self.assertEqual(utils.getaddresses(addresses, strict=False), |
| + [('', ''), ('', ''), ('', '*--')]) |
| |
| def test_getaddresses_embedded_comment(self): |
| """Test proper handling of a nested comment""" |
| @@ -3440,6 +3580,54 @@ def test_mime_classes_policy_argument(self): |
| m = cls(*constructor, policy=email.policy.default) |
| self.assertIs(m.policy, email.policy.default) |
| |
| + def test_iter_escaped_chars(self): |
| + self.assertEqual(list(utils._iter_escaped_chars(r'a\\b\"c\\"d')), |
| + [(0, 'a'), |
| + (2, '\\\\'), |
| + (3, 'b'), |
| + (5, '\\"'), |
| + (6, 'c'), |
| + (8, '\\\\'), |
| + (9, '"'), |
| + (10, 'd')]) |
| + self.assertEqual(list(utils._iter_escaped_chars('a\\')), |
| + [(0, 'a'), (1, '\\')]) |
| + |
| + def test_strip_quoted_realnames(self): |
| + def check(addr, expected): |
| + self.assertEqual(utils._strip_quoted_realnames(addr), expected) |
| + |
| + check('"Jane Doe" <jane@example.net>, "John Doe" <john@example.net>', |
| + ' <jane@example.net>, <john@example.net>') |
| + check(r'"Jane \"Doe\"." <jane@example.net>', |
| + ' <jane@example.net>') |
| + |
| + # special cases |
| + check(r'before"name"after', 'beforeafter') |
| + check(r'before"name"', 'before') |
| + check(r'b"name"', 'b') # single char |
| + check(r'"name"after', 'after') |
| + check(r'"name"a', 'a') # single char |
| + check(r'"name"', '') |
| + |
| + # no change |
| + for addr in ( |
| + 'Jane Doe <jane@example.net>, John Doe <john@example.net>', |
| + 'lone " quote', |
| + ): |
| + self.assertEqual(utils._strip_quoted_realnames(addr), addr) |
| + |
| + |
| + def test_check_parenthesis(self): |
| + addr = 'alice@example.net' |
| + self.assertTrue(utils._check_parenthesis(f'{addr} (Alice)')) |
| + self.assertFalse(utils._check_parenthesis(f'{addr} )Alice(')) |
| + self.assertFalse(utils._check_parenthesis(f'{addr} (Alice))')) |
| + self.assertFalse(utils._check_parenthesis(f'{addr} ((Alice)')) |
| + |
| + # Ignore real name between quotes |
| + self.assertTrue(utils._check_parenthesis(f'")Alice((" {addr}')) |
| + |
| |
| # Test the iterator/generators |
| class TestIterators(TestEmailBase): |
| diff --git a/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst |
| new file mode 100644 |
| index 00000000000000..3d0e9e4078c934 |
| --- /dev/null |
| +++ b/Misc/NEWS.d/next/Library/2023-10-20-15-28-08.gh-issue-102988.dStNO7.rst |
| @@ -0,0 +1,8 @@ |
| +:func:`email.utils.getaddresses` and :func:`email.utils.parseaddr` now |
| +return ``('', '')`` 2-tuples in more situations where invalid email |
| +addresses are encountered instead of potentially inaccurate values. Add |
| +optional *strict* parameter to these two functions: use ``strict=False`` to |
| +get the old behavior and accept malformed inputs. |
| +``getattr(email.utils, 'supports_strict_parsing', False)`` can be used to check |
| +if the *strict* parameter is available. Patch by Thomas Dwyer and Victor |
| +Stinner to improve the CVE-2023-27043 fix. |