| From 93020266b7049789fc52f7450ee7ec48385f50a0 Mon Sep 17 00:00:00 2001 |
| From: "Miss Islington (bot)" |
| <31488909+miss-islington@users.noreply.github.com> |
| Date: Mon, 22 May 2023 03:42:37 -0700 |
| Subject: [PATCH] [3.8] gh-102153: Start stripping C0 control and space chars |
| in `urlsplit` (GH-102508) (GH-104575) (GH-104592) (#104593) |
| |
| gh-102153: Start stripping C0 control and space chars in `urlsplit` (GH-102508) |
| |
| `urllib.parse.urlsplit` has already been respecting the WHATWG spec a bit GH-25595. |
| |
| This adds more sanitizing to respect the "Remove any leading C0 control or space from input" [rule](https://url.spec.whatwg.org/GH-url-parsing:~:text=Remove%20any%20leading%20and%20trailing%20C0%20control%20or%20space%20from%20input.) in response to [CVE-2023-24329](https://nvd.nist.gov/vuln/detail/CVE-2023-24329). |
| |
| I simplified the docs by eliding the state of the world explanatory |
| paragraph in this security release only backport. (people will see |
| that in the mainline /3/ docs) |
| |
| (cherry picked from commit d7f8a5fe07b0ff3a419ccec434cc405b21a5a304) |
| (cherry picked from commit 2f630e1ce18ad2e07428296532a68b11dc66ad10) |
| (cherry picked from commit 610cc0ab1b760b2abaac92bd256b96191c46b941) |
| (cherry picked from commit f48a96a28012d28ae37a2f4587a780a5eb779946) |
| |
| Co-authored-by: Illia Volochii <illia.volochii@gmail.com> |
| Co-authored-by: Gregory P. Smith [Google] <greg@krypto.org> |
| --- |
| Doc/library/urllib.parse.rst | 38 +++++++++++- |
| Lib/test/test_urlparse.py | 61 ++++++++++++++++++- |
| Lib/urllib/parse.py | 12 ++++ |
| ...-03-07-20-59-17.gh-issue-102153.14CLSZ.rst | 3 + |
| 4 files changed, 111 insertions(+), 3 deletions(-) |
| create mode 100644 Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst |
| |
| diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst |
| index a6cfc5d3dc13..7dda121f2616 100644 |
| --- a/Doc/library/urllib.parse.rst |
| +++ b/Doc/library/urllib.parse.rst |
| @@ -147,6 +147,10 @@ or on combining URL components into a URL string. |
| ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', |
| params='', query='', fragment='') |
| |
| + .. warning:: |
| + |
| + :func:`urlparse` does not perform validation. See :ref:`URL parsing |
| + security <url-parsing-security>` for details. |
| |
| .. versionchanged:: 3.2 |
| Added IPv6 URL parsing capabilities. |
| @@ -312,8 +316,14 @@ or on combining URL components into a URL string. |
| ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is |
| decomposed before parsing, no error will be raised. |
| |
| - Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline |
| - ``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL. |
| + Following some of the `WHATWG spec`_ that updates RFC 3986, leading C0 |
| + control and space characters are stripped from the URL. ``\n``, |
| + ``\r`` and tab ``\t`` characters are removed from the URL at any position. |
| + |
| + .. warning:: |
| + |
| + :func:`urlsplit` does not perform validation. See :ref:`URL parsing |
| + security <url-parsing-security>` for details. |
| |
| .. versionchanged:: 3.6 |
| Out-of-range port numbers now raise :exc:`ValueError`, instead of |
| @@ -326,6 +336,9 @@ or on combining URL components into a URL string. |
| .. versionchanged:: 3.8.10 |
| ASCII newline and tab characters are stripped from the URL. |
| |
| + .. versionchanged:: 3.8.17 |
| + Leading WHATWG C0 control and space characters are stripped from the URL. |
| + |
| .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser |
| |
| .. function:: urlunsplit(parts) |
| @@ -402,6 +415,27 @@ or on combining URL components into a URL string. |
| or ``scheme://host/path``). If *url* is not a wrapped URL, it is returned |
| without changes. |
| |
| +.. _url-parsing-security: |
| + |
| +URL parsing security |
| +-------------------- |
| + |
| +The :func:`urlsplit` and :func:`urlparse` APIs do not perform **validation** of |
| +inputs. They may not raise errors on inputs that other applications consider |
| +invalid. They may also succeed on some inputs that might not be considered |
| +URLs elsewhere. Their purpose is for practical functionality rather than |
| +purity. |
| + |
| +Instead of raising an exception on unusual input, they may instead return some |
| +component parts as empty strings. Or components may contain more than perhaps |
| +they should. |
| + |
| +We recommend that users of these APIs where the values may be used anywhere |
| +with security implications code defensively. Do some verification within your |
| +code before trusting a returned component part. Does that ``scheme`` make |
| +sense? Is that a sensible ``path``? Is there anything strange about that |
| +``hostname``? etc. |
| + |
| .. _parsing-ascii-encoded-bytes: |
| |
| Parsing ASCII Encoded Bytes |
| diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py |
| index 0f99130f5da8..0ad3bf128bbf 100644 |
| --- a/Lib/test/test_urlparse.py |
| +++ b/Lib/test/test_urlparse.py |
| @@ -660,6 +660,65 @@ def test_urlsplit_remove_unsafe_bytes(self): |
| self.assertEqual(p.scheme, "https") |
| self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment") |
| |
| + def test_urlsplit_strip_url(self): |
| + noise = bytes(range(0, 0x20 + 1)) |
| + base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" |
| + |
| + url = noise.decode("utf-8") + base_url |
| + p = urllib.parse.urlsplit(url) |
| + self.assertEqual(p.scheme, "http") |
| + self.assertEqual(p.netloc, "User:Pass@www.python.org:080") |
| + self.assertEqual(p.path, "/doc/") |
| + self.assertEqual(p.query, "query=yes") |
| + self.assertEqual(p.fragment, "frag") |
| + self.assertEqual(p.username, "User") |
| + self.assertEqual(p.password, "Pass") |
| + self.assertEqual(p.hostname, "www.python.org") |
| + self.assertEqual(p.port, 80) |
| + self.assertEqual(p.geturl(), base_url) |
| + |
| + url = noise + base_url.encode("utf-8") |
| + p = urllib.parse.urlsplit(url) |
| + self.assertEqual(p.scheme, b"http") |
| + self.assertEqual(p.netloc, b"User:Pass@www.python.org:080") |
| + self.assertEqual(p.path, b"/doc/") |
| + self.assertEqual(p.query, b"query=yes") |
| + self.assertEqual(p.fragment, b"frag") |
| + self.assertEqual(p.username, b"User") |
| + self.assertEqual(p.password, b"Pass") |
| + self.assertEqual(p.hostname, b"www.python.org") |
| + self.assertEqual(p.port, 80) |
| + self.assertEqual(p.geturl(), base_url.encode("utf-8")) |
| + |
| + # Test that trailing space is preserved as some applications rely on |
| + # this within query strings. |
| + query_spaces_url = "https://www.python.org:88/doc/?query= " |
| + p = urllib.parse.urlsplit(noise.decode("utf-8") + query_spaces_url) |
| + self.assertEqual(p.scheme, "https") |
| + self.assertEqual(p.netloc, "www.python.org:88") |
| + self.assertEqual(p.path, "/doc/") |
| + self.assertEqual(p.query, "query= ") |
| + self.assertEqual(p.port, 88) |
| + self.assertEqual(p.geturl(), query_spaces_url) |
| + |
| + p = urllib.parse.urlsplit("www.pypi.org ") |
| + # That "hostname" gets considered a "path" due to the |
| + # trailing space and our existing logic... YUCK... |
| + # and re-assembles via geturl aka unurlsplit into the original. |
| + # django.core.validators.URLValidator (at least through v3.2) relies on |
| + # this, for better or worse, to catch it in a ValidationError via its |
| + # regular expressions. |
| + # Here we test the basic round trip concept of such a trailing space. |
| + self.assertEqual(urllib.parse.urlunsplit(p), "www.pypi.org ") |
| + |
| + # with scheme as cache-key |
| + url = "//www.python.org/" |
| + scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8") |
| + for _ in range(2): |
| + p = urllib.parse.urlsplit(url, scheme=scheme) |
| + self.assertEqual(p.scheme, "https") |
| + self.assertEqual(p.geturl(), "https://www.python.org/") |
| + |
| def test_attributes_bad_port(self): |
| """Check handling of invalid ports.""" |
| for bytes in (False, True): |
| @@ -667,7 +726,7 @@ def test_attributes_bad_port(self): |
| for port in ("foo", "1.5", "-1", "0x10"): |
| with self.subTest(bytes=bytes, parse=parse, port=port): |
| netloc = "www.example.net:" + port |
| - url = "http://" + netloc |
| + url = "http://" + netloc + "/" |
| if bytes: |
| netloc = netloc.encode("ascii") |
| url = url.encode("ascii") |
| diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py |
| index f0d9d4d803c4..979e6d2127c0 100644 |
| --- a/Lib/urllib/parse.py |
| +++ b/Lib/urllib/parse.py |
| @@ -25,6 +25,10 @@ |
| scenarios for parsing, and for backward compatibility purposes, some |
| parsing quirks from older RFCs are retained. The testcases in |
| test_urlparse.py provides a good indicator of parsing behavior. |
| + |
| +The WHATWG URL Parser spec should also be considered. We are not compliant with |
| +it either due to existing user code API behavior expectations (Hyrum's Law). |
| +It serves as a useful guide when making changes. |
| """ |
| |
| import re |
| @@ -77,6 +81,10 @@ |
| '0123456789' |
| '+-.') |
| |
| +# Leading and trailing C0 control and space to be stripped per WHATWG spec. |
| +# == "".join([chr(i) for i in range(0, 0x20 + 1)]) |
| +_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ' |
| + |
| # Unsafe bytes to be removed per WHATWG spec |
| _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] |
| |
| @@ -431,6 +439,10 @@ def urlsplit(url, scheme='', allow_fragments=True): |
| url, scheme, _coerce_result = _coerce_args(url, scheme) |
| url = _remove_unsafe_bytes_from_url(url) |
| scheme = _remove_unsafe_bytes_from_url(scheme) |
| + # Only lstrip url as some applications rely on preserving trailing space. |
| + # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both) |
| + url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE) |
| + scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) |
| allow_fragments = bool(allow_fragments) |
| key = url, scheme, allow_fragments, type(url), type(scheme) |
| cached = _parse_cache.get(key, None) |
| diff --git a/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst |
| new file mode 100644 |
| index 000000000000..e57ac4ed3ac5 |
| --- /dev/null |
| +++ b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst |
| @@ -0,0 +1,3 @@ |
| +:func:`urllib.parse.urlsplit` now strips leading C0 control and space |
| +characters following the specification for URLs defined by WHATWG in |
| +response to CVE-2023-24329. Patch by Illia Volochii. |