You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
156 lines
6.1 KiB
156 lines
6.1 KiB
diff --git a/Doc/library/urlparse.rst b/Doc/library/urlparse.rst |
|
index efd112d..61022f7 100644 |
|
--- a/Doc/library/urlparse.rst |
|
+++ b/Doc/library/urlparse.rst |
|
@@ -118,6 +118,12 @@ The :mod:`urlparse` module defines the following functions: |
|
See section :ref:`urlparse-result-object` for more information on the result |
|
object. |
|
|
|
+ Characters in the :attr:`netloc` attribute that decompose under NFKC |
|
+ normalization (as used by the IDNA encoding) into any of ``/``, ``?``, |
|
+ ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is |
|
+ decomposed before parsing, or is not a Unicode string, no error will be |
|
+ raised. |
|
+ |
|
.. versionchanged:: 2.5 |
|
Added attributes to return value. |
|
|
|
@@ -125,6 +131,11 @@ The :mod:`urlparse` module defines the following functions: |
|
Added IPv6 URL parsing capabilities. |
|
|
|
|
|
+ .. versionchanged:: 2.7.17 |
|
+ Characters that affect netloc parsing under NFKC normalization will |
|
+ now raise :exc:`ValueError`. |
|
+ |
|
+ |
|
.. function:: parse_qs(qs[, keep_blank_values[, strict_parsing]]) |
|
|
|
Parse a query string given as a string argument (data of type |
|
@@ -219,11 +230,21 @@ The :mod:`urlparse` module defines the following functions: |
|
See section :ref:`urlparse-result-object` for more information on the result |
|
object. |
|
|
|
+ Characters in the :attr:`netloc` attribute that decompose under NFKC |
|
+ normalization (as used by the IDNA encoding) into any of ``/``, ``?``, |
|
+ ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is |
|
+ decomposed before parsing, or is not a Unicode string, no error will be |
|
+ raised. |
|
+ |
|
.. versionadded:: 2.2 |
|
|
|
.. versionchanged:: 2.5 |
|
Added attributes to return value. |
|
|
|
+ .. versionchanged:: 2.7.17 |
|
+ Characters that affect netloc parsing under NFKC normalization will |
|
+ now raise :exc:`ValueError`. |
|
+ |
|
|
|
.. function:: urlunsplit(parts) |
|
|
|
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py |
|
index 72ebfaa..2717163 100644 |
|
--- a/Lib/test/test_urlparse.py |
|
+++ b/Lib/test/test_urlparse.py |
|
@@ -1,6 +1,8 @@ |
|
#! /usr/bin/env python |
|
|
|
from test import test_support |
|
+import sys |
|
+import unicodedata |
|
import unittest |
|
import urlparse |
|
|
|
@@ -564,6 +566,45 @@ class UrlParseTestCase(unittest.TestCase): |
|
self.assertEqual(urlparse.urlparse("http://www.python.org:80"), |
|
('http','www.python.org:80','','','','')) |
|
|
|
+ def test_urlsplit_normalization(self): |
|
+ # Certain characters should never occur in the netloc, |
|
+ # including under normalization. |
|
+ # Ensure that ALL of them are detected and cause an error |
|
+ illegal_chars = u'/:#?@' |
|
+ hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars} |
|
+ denorm_chars = [ |
|
+ c for c in map(unichr, range(128, sys.maxunicode)) |
|
+ if (hex_chars & set(unicodedata.decomposition(c).split())) |
|
+ and c not in illegal_chars |
|
+ ] |
|
+ # Sanity check that we found at least one such character |
|
+ self.assertIn(u'\u2100', denorm_chars) |
|
+ self.assertIn(u'\uFF03', denorm_chars) |
|
+ |
|
+ # bpo-36742: Verify port separators are ignored when they |
|
+ # existed prior to decomposition |
|
+ urlparse.urlsplit(u'http://\u30d5\u309a:80') |
|
+ with self.assertRaises(ValueError): |
|
+ urlparse.urlsplit(u'http://\u30d5\u309a\ufe1380') |
|
+ |
|
+ for scheme in [u"http", u"https", u"ftp"]: |
|
+ for netloc in [u"netloc{}false.netloc", u"n{}user@netloc"]: |
|
+ for c in denorm_chars: |
|
+ url = u"{}://{}/path".format(scheme, netloc.format(c)) |
|
+ if test_support.verbose: |
|
+ print "Checking %r" % url |
|
+ with self.assertRaises(ValueError): |
|
+ urlparse.urlsplit(url) |
|
+ |
|
+ # check error message: invalid netloc must be formatted with repr() |
|
+ # to get an ASCII error message |
|
+ with self.assertRaises(ValueError) as cm: |
|
+ urlparse.urlsplit(u'http://example.com\uFF03@bing.com') |
|
+ self.assertEqual(str(cm.exception), |
|
+ "netloc u'example.com\\uff03@bing.com' contains invalid characters " |
|
+ "under NFKC normalization") |
|
+ self.assertIsInstance(cm.exception.args[0], str) |
|
+ |
|
def test_main(): |
|
test_support.run_unittest(UrlParseTestCase) |
|
|
|
diff --git a/Lib/urlparse.py b/Lib/urlparse.py |
|
index 4ce982e..9a1df74 100644 |
|
--- a/Lib/urlparse.py |
|
+++ b/Lib/urlparse.py |
|
@@ -164,6 +164,25 @@ def _splitnetloc(url, start=0): |
|
delim = min(delim, wdelim) # use earliest delim position |
|
return url[start:delim], url[delim:] # return (domain, rest) |
|
|
|
+def _checknetloc(netloc): |
|
+ if not netloc or not isinstance(netloc, unicode): |
|
+ return |
|
+ # looking for characters like \u2100 that expand to 'a/c' |
|
+ # IDNA uses NFKC equivalence, so normalize for this check |
|
+ import unicodedata |
|
+ n = netloc.replace(u'@', u'') # ignore characters already included |
|
+ n = n.replace(u':', u'') # but not the surrounding text |
|
+ n = n.replace(u'#', u'') |
|
+ n = n.replace(u'?', u'') |
|
+ netloc2 = unicodedata.normalize('NFKC', n) |
|
+ if n == netloc2: |
|
+ return |
|
+ for c in '/?#@:': |
|
+ if c in netloc2: |
|
+ raise ValueError("netloc %r contains invalid characters " |
|
+ "under NFKC normalization" |
|
+ % netloc) |
|
+ |
|
def urlsplit(url, scheme='', allow_fragments=True): |
|
"""Parse a URL into 5 components: |
|
<scheme>://<netloc>/<path>?<query>#<fragment> |
|
@@ -192,6 +211,7 @@ def urlsplit(url, scheme='', allow_fragments=True): |
|
url, fragment = url.split('#', 1) |
|
if '?' in url: |
|
url, query = url.split('?', 1) |
|
+ _checknetloc(netloc) |
|
v = SplitResult(scheme, netloc, url, query, fragment) |
|
_parse_cache[key] = v |
|
return v |
|
@@ -215,6 +235,7 @@ def urlsplit(url, scheme='', allow_fragments=True): |
|
url, fragment = url.split('#', 1) |
|
if '?' in url: |
|
url, query = url.split('?', 1) |
|
+ _checknetloc(netloc) |
|
v = SplitResult(scheme, netloc, url, query, fragment) |
|
_parse_cache[key] = v |
|
return v
|
|
|