| From 6c472d3a1d334d4eeb4a25eba7bf3b01611bf667 Mon Sep 17 00:00:00 2001 |
| From: "Miss Islington (bot)" |
| <31488909+miss-islington@users.noreply.github.com> |
| Date: Thu, 6 May 2021 09:56:01 -0700 |
| Subject: [PATCH] [3.6] bpo-43882 - urllib.parse should sanitize urls |
| containing ASCII newline and tabs (GH-25924) |
| |
| Co-authored-by: Gregory P. Smith <greg@krypto.org> |
| Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> |
| (cherry picked from commit 76cd81d60310d65d01f9d7b48a8985d8ab89c8b4) |
| Co-authored-by: Senthil Kumaran <senthil@uthcode.com> |
| (cherry picked from commit 515a7bc4e13645d0945b46a8e1d9102b918cd407) |
| |
| Co-authored-by: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> |
| --- |
| Doc/library/urllib.parse.rst | 13 +++++ |
| Lib/test/test_urlparse.py | 48 +++++++++++++++++++ |
| Lib/urllib/parse.py | 10 ++++ |
| .../2021-04-25-07-46-37.bpo-43882.Jpwx85.rst | 6 +++ |
| 4 files changed, 77 insertions(+) |
| create mode 100644 Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst |
| |
| diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst |
| index 3c2e37ef20..b717d7cc05 100644 |
| --- a/Doc/library/urllib.parse.rst |
| +++ b/Doc/library/urllib.parse.rst |
| @@ -288,6 +288,9 @@ or on combining URL components into a URL string. |
| ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is |
| decomposed before parsing, no error will be raised. |
| |
| + Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline |
| + ``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL. |
| + |
| .. versionchanged:: 3.6 |
| Out-of-range port numbers now raise :exc:`ValueError`, instead of |
| returning :const:`None`. |
| @@ -296,6 +299,10 @@ or on combining URL components into a URL string. |
| Characters that affect netloc parsing under NFKC normalization will |
| now raise :exc:`ValueError`. |
| |
| + .. versionchanged:: 3.6.14 |
| + ASCII newline and tab characters are stripped from the URL. |
| + |
| +.. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser |
| |
| .. function:: urlunsplit(parts) |
| |
| @@ -633,6 +640,10 @@ task isn't already covered by the URL parsing functions above. |
| |
| .. seealso:: |
| |
| + `WHATWG`_ - URL Living standard |
| + Working Group for the URL Standard that defines URLs, domains, IP addresses, the |
| + application/x-www-form-urlencoded format, and their API. |
| + |
| :rfc:`3986` - Uniform Resource Identifiers |
| This is the current standard (STD66). Any changes to urllib.parse module |
| should conform to this. Certain deviations could be observed, which are |
| @@ -656,3 +667,5 @@ task isn't already covered by the URL parsing functions above. |
| |
| :rfc:`1738` - Uniform Resource Locators (URL) |
| This specifies the formal syntax and semantics of absolute URLs. |
| + |
| +.. _WHATWG: https://url.spec.whatwg.org/ |
| diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py |
| index e3088b2f39..3509278a01 100644 |
| --- a/Lib/test/test_urlparse.py |
| +++ b/Lib/test/test_urlparse.py |
| @@ -612,6 +612,54 @@ class UrlParseTestCase(unittest.TestCase): |
| with self.assertRaisesRegex(ValueError, "out of range"): |
| p.port |
| |
| + def test_urlsplit_remove_unsafe_bytes(self): |
| + # Remove ASCII tabs and newlines from input, for http common case scenario. |
| + url = "h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" |
| + p = urllib.parse.urlsplit(url) |
| + self.assertEqual(p.scheme, "http") |
| + self.assertEqual(p.netloc, "www.python.org") |
| + self.assertEqual(p.path, "/javascript:alert('msg')/") |
| + self.assertEqual(p.query, "query=something") |
| + self.assertEqual(p.fragment, "fragment") |
| + self.assertEqual(p.username, None) |
| + self.assertEqual(p.password, None) |
| + self.assertEqual(p.hostname, "www.python.org") |
| + self.assertEqual(p.port, None) |
| + self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment") |
| + |
| + # Remove ASCII tabs and newlines from input as bytes, for http common case scenario. |
| + url = b"h\nttp://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" |
| + p = urllib.parse.urlsplit(url) |
| + self.assertEqual(p.scheme, b"http") |
| + self.assertEqual(p.netloc, b"www.python.org") |
| + self.assertEqual(p.path, b"/javascript:alert('msg')/") |
| + self.assertEqual(p.query, b"query=something") |
| + self.assertEqual(p.fragment, b"fragment") |
| + self.assertEqual(p.username, None) |
| + self.assertEqual(p.password, None) |
| + self.assertEqual(p.hostname, b"www.python.org") |
| + self.assertEqual(p.port, None) |
| + self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/?query=something#fragment") |
| + |
| + # any scheme |
| + url = "x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" |
| + p = urllib.parse.urlsplit(url) |
| + self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment") |
| + |
| + # Remove ASCII tabs and newlines from input as bytes, any scheme. |
| + url = b"x-new-scheme\t://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" |
| + p = urllib.parse.urlsplit(url) |
| + self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/?query=something#fragment") |
| + |
| + # Unsafe bytes are not returned from urlparse cache. |
| + # scheme is stored after parsing, sending a scheme with unsafe bytes *will not* return an unsafe scheme |
| + url = "https://www.python\n.org\t/java\nscript:\talert('msg\r\n')/?query\n=\tsomething#frag\nment" |
| + scheme = "htt\nps" |
| + for _ in range(2): |
| + p = urllib.parse.urlsplit(url, scheme=scheme) |
| + self.assertEqual(p.scheme, "https") |
| + self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment") |
| + |
| def test_attributes_bad_port(self): |
| """Check handling of invalid ports.""" |
| for bytes in (False, True): |
| diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py |
| index 66056bf589..ac6e7a9cee 100644 |
| --- a/Lib/urllib/parse.py |
| +++ b/Lib/urllib/parse.py |
| @@ -76,6 +76,9 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz' |
| '0123456789' |
| '+-.') |
| |
| +# Unsafe bytes to be removed per WHATWG spec |
| +_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] |
| + |
| # XXX: Consider replacing with functools.lru_cache |
| MAX_CACHE_SIZE = 20 |
| _parse_cache = {} |
| @@ -409,6 +412,11 @@ def _checknetloc(netloc): |
| raise ValueError("netloc '" + netloc + "' contains invalid " + |
| "characters under NFKC normalization") |
| |
| +def _remove_unsafe_bytes_from_url(url): |
| + for b in _UNSAFE_URL_BYTES_TO_REMOVE: |
| + url = url.replace(b, "") |
| + return url |
| + |
| def urlsplit(url, scheme='', allow_fragments=True): |
| """Parse a URL into 5 components: |
| <scheme>://<netloc>/<path>?<query>#<fragment> |
| @@ -416,6 +424,8 @@ def urlsplit(url, scheme='', allow_fragments=True): |
| Note that we don't break the components up in smaller bits |
| (e.g. netloc is a single string) and we don't expand % escapes.""" |
| url, scheme, _coerce_result = _coerce_args(url, scheme) |
| + url = _remove_unsafe_bytes_from_url(url) |
| + scheme = _remove_unsafe_bytes_from_url(scheme) |
| allow_fragments = bool(allow_fragments) |
| key = url, scheme, allow_fragments, type(url), type(scheme) |
| cached = _parse_cache.get(key, None) |
| diff --git a/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst b/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst |
| new file mode 100644 |
| index 0000000000..a326d079df |
| --- /dev/null |
| +++ b/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst |
| @@ -0,0 +1,6 @@ |
| +The presence of newline or tab characters in parts of a URL could allow |
| +some forms of attacks. |
| + |
| +Following the controlling specification for URLs defined by WHATWG, |
| +:mod:`urllib.parse` now removes ASCII newlines and tabs from URLs, |
| +preventing such attacks. |
| -- |
| 2.41.0.255.g8b1d071c50-goog |
| |