You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
226 lines
8.8 KiB
226 lines
8.8 KiB
From 90986ef48c0df602ab38aa831a24e99e9ed61e7e Mon Sep 17 00:00:00 2001 |
|
From: Charalampos Stratakis <cstratak@redhat.com> |
|
Date: Mon, 4 Apr 2016 15:55:28 +0200 |
|
Subject: [PATCH] JSON decoder now accepts lone surrogates |
|
|
|
--- |
|
Lib/json/decoder.py | 35 ++++++++++++------------ |
|
Lib/json/tests/test_scanstring.py | 56 ++++++++++++++++++++++++++++++++++++--- |
|
Modules/_json.c | 49 +++++++++------------------------- |
|
3 files changed, 83 insertions(+), 57 deletions(-) |
|
|
|
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py |
|
index dfcc628..1b43238 100644 |
|
--- a/Lib/json/decoder.py |
|
+++ b/Lib/json/decoder.py |
|
@@ -62,6 +62,16 @@ BACKSLASH = { |
|
|
|
DEFAULT_ENCODING = "utf-8" |
|
|
|
+def _decode_uXXXX(s, pos): |
|
+ esc = s[pos + 1:pos + 5] |
|
+ if len(esc) == 4 and esc[1] not in 'xX': |
|
+ try: |
|
+ return int(esc, 16) |
|
+ except ValueError: |
|
+ pass |
|
+ msg = "Invalid \\uXXXX escape" |
|
+ raise ValueError(errmsg(msg, s, pos)) |
|
+ |
|
def py_scanstring(s, end, encoding=None, strict=True, |
|
_b=BACKSLASH, _m=STRINGCHUNK.match): |
|
"""Scan the string s for a JSON string. End is the index of the |
|
@@ -116,25 +126,16 @@ def py_scanstring(s, end, encoding=None, strict=True, |
|
end += 1 |
|
else: |
|
# Unicode escape sequence |
|
- esc = s[end + 1:end + 5] |
|
- next_end = end + 5 |
|
- if len(esc) != 4: |
|
- msg = "Invalid \\uXXXX escape" |
|
- raise ValueError(errmsg(msg, s, end)) |
|
- uni = int(esc, 16) |
|
+ uni = _decode_uXXXX(s, end) |
|
+ end += 5 |
|
# Check for surrogate pair on UCS-4 systems |
|
- if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: |
|
- msg = "Invalid \\uXXXX\\uXXXX surrogate pair" |
|
- if not s[end + 5:end + 7] == '\\u': |
|
- raise ValueError(errmsg(msg, s, end)) |
|
- esc2 = s[end + 7:end + 11] |
|
- if len(esc2) != 4: |
|
- raise ValueError(errmsg(msg, s, end)) |
|
- uni2 = int(esc2, 16) |
|
- uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) |
|
- next_end += 6 |
|
+ if sys.maxunicode > 65535 and \ |
|
+ 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u': |
|
+ uni2 = _decode_uXXXX(s, end + 1) |
|
+ if 0xdc00 <= uni2 <= 0xdfff: |
|
+ uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) |
|
+ end += 6 |
|
char = unichr(uni) |
|
- end = next_end |
|
# Append the unescaped character |
|
_append(char) |
|
return u''.join(chunks), end |
|
diff --git a/Lib/json/tests/test_scanstring.py b/Lib/json/tests/test_scanstring.py |
|
index 4fef8cb..ed80a41 100644 |
|
--- a/Lib/json/tests/test_scanstring.py |
|
+++ b/Lib/json/tests/test_scanstring.py |
|
@@ -5,10 +5,6 @@ from json.tests import PyTest, CTest |
|
class TestScanstring(object): |
|
def test_scanstring(self): |
|
scanstring = self.json.decoder.scanstring |
|
- self.assertEqual( |
|
- scanstring('"z\\ud834\\udd20x"', 1, None, True), |
|
- (u'z\U0001d120x', 16)) |
|
- |
|
if sys.maxunicode == 65535: |
|
self.assertEqual( |
|
scanstring(u'"z\U0001d120x"', 1, None, True), |
|
@@ -94,6 +90,58 @@ class TestScanstring(object): |
|
scanstring('["Bad value", truth]', 2, None, True), |
|
(u'Bad value', 12)) |
|
|
|
+ def test_surrogates(self): |
|
+ scanstring = self.json.decoder.scanstring |
|
+ def assertScan(given, expect): |
|
+ self.assertEqual(scanstring(given, 1, None, True), |
|
+ (expect, len(given))) |
|
+ if not isinstance(given, unicode): |
|
+ given = unicode(given) |
|
+ self.assertEqual(scanstring(given, 1, None, True), |
|
+ (expect, len(given))) |
|
+ |
|
+ surrogates = unichr(0xd834) + unichr(0xdd20) |
|
+ assertScan('"z\\ud834\\u0079x"', u'z\ud834yx') |
|
+ assertScan('"z\\ud834\\udd20x"', u'z\U0001d120x') |
|
+ assertScan('"z\\ud834\\ud834\\udd20x"', u'z\ud834\U0001d120x') |
|
+ assertScan('"z\\ud834x"', u'z\ud834x') |
|
+ assertScan(u'"z\\ud834\udd20x12345"', u'z%sx12345' % surrogates) |
|
+ assertScan('"z\\udd20x"', u'z\udd20x') |
|
+ assertScan(u'"z\ud834\udd20x"', u'z\ud834\udd20x') |
|
+ assertScan(u'"z\ud834\\udd20x"', u'z%sx' % surrogates) |
|
+ assertScan(u'"z\ud834x"', u'z\ud834x') |
|
+ |
|
+ def test_bad_escapes(self): |
|
+ scanstring = self.json.decoder.scanstring |
|
+ bad_escapes = [ |
|
+ '"\\"', |
|
+ '"\\x"', |
|
+ '"\\u"', |
|
+ '"\\u0"', |
|
+ '"\\u01"', |
|
+ '"\\u012"', |
|
+ '"\\uz012"', |
|
+ '"\\u0z12"', |
|
+ '"\\u01z2"', |
|
+ '"\\u012z"', |
|
+ '"\\u0x12"', |
|
+ '"\\u0X12"', |
|
+ '"\\ud834\\"', |
|
+ '"\\ud834\\u"', |
|
+ '"\\ud834\\ud"', |
|
+ '"\\ud834\\udd"', |
|
+ '"\\ud834\\udd2"', |
|
+ '"\\ud834\\uzdd2"', |
|
+ '"\\ud834\\udzd2"', |
|
+ '"\\ud834\\uddz2"', |
|
+ '"\\ud834\\udd2z"', |
|
+ '"\\ud834\\u0x20"', |
|
+ '"\\ud834\\u0X20"', |
|
+ ] |
|
+ for s in bad_escapes: |
|
+ with self.assertRaises(ValueError): |
|
+ scanstring(s, 1, None, True) |
|
+ |
|
def test_issue3623(self): |
|
self.assertRaises(ValueError, self.json.decoder.scanstring, b"xxx", 1, |
|
"xxx") |
|
diff --git a/Modules/_json.c b/Modules/_json.c |
|
index 7c925fd..56d9ee4 100644 |
|
--- a/Modules/_json.c |
|
+++ b/Modules/_json.c |
|
@@ -524,16 +524,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s |
|
} |
|
#ifdef Py_UNICODE_WIDE |
|
/* Surrogate pair */ |
|
- if ((c & 0xfc00) == 0xd800) { |
|
+ if ((c & 0xfc00) == 0xd800 && end + 6 < len && |
|
+ buf[next++] == '\\' && |
|
+ buf[next++] == 'u') { |
|
Py_UNICODE c2 = 0; |
|
- if (end + 6 >= len) { |
|
- raise_errmsg("Unpaired high surrogate", pystr, end - 5); |
|
- goto bail; |
|
- } |
|
- if (buf[next++] != '\\' || buf[next++] != 'u') { |
|
- raise_errmsg("Unpaired high surrogate", pystr, end - 5); |
|
- goto bail; |
|
- } |
|
end += 6; |
|
/* Decode 4 hex digits */ |
|
for (; next < end; next++) { |
|
@@ -554,15 +548,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s |
|
goto bail; |
|
} |
|
} |
|
- if ((c2 & 0xfc00) != 0xdc00) { |
|
- raise_errmsg("Unpaired high surrogate", pystr, end - 5); |
|
- goto bail; |
|
- } |
|
- c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); |
|
- } |
|
- else if ((c & 0xfc00) == 0xdc00) { |
|
- raise_errmsg("Unpaired low surrogate", pystr, end - 5); |
|
- goto bail; |
|
+ if ((c2 & 0xfc00) == 0xdc00) |
|
+ c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); |
|
+ else |
|
+ end -= 6; |
|
} |
|
#endif |
|
} |
|
@@ -703,16 +692,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next |
|
} |
|
#ifdef Py_UNICODE_WIDE |
|
/* Surrogate pair */ |
|
- if ((c & 0xfc00) == 0xd800) { |
|
+ if ((c & 0xfc00) == 0xd800 && end + 6 < len && |
|
+ buf[next++] == '\\' && buf[next++] == 'u') { |
|
Py_UNICODE c2 = 0; |
|
- if (end + 6 >= len) { |
|
- raise_errmsg("Unpaired high surrogate", pystr, end - 5); |
|
- goto bail; |
|
- } |
|
- if (buf[next++] != '\\' || buf[next++] != 'u') { |
|
- raise_errmsg("Unpaired high surrogate", pystr, end - 5); |
|
- goto bail; |
|
- } |
|
end += 6; |
|
/* Decode 4 hex digits */ |
|
for (; next < end; next++) { |
|
@@ -733,15 +715,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next |
|
goto bail; |
|
} |
|
} |
|
- if ((c2 & 0xfc00) != 0xdc00) { |
|
- raise_errmsg("Unpaired high surrogate", pystr, end - 5); |
|
- goto bail; |
|
- } |
|
- c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); |
|
- } |
|
- else if ((c & 0xfc00) == 0xdc00) { |
|
- raise_errmsg("Unpaired low surrogate", pystr, end - 5); |
|
- goto bail; |
|
+ if ((c2 & 0xfc00) == 0xdc00) |
|
+ c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); |
|
+ else |
|
+ end -= 6; |
|
} |
|
#endif |
|
} |
|
-- |
|
2.5.5 |
|
|
|
|