From 4bca34eb6af12d2aed0cdce7f395fcf8ac6dafab Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 7 Sep 2022 17:23:54 -0400 Subject: [PATCH] gh-96268: Fix loading invalid UTF-8 (GH-96270) This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:decode_utf8. It also fixes an off-by-one error introduced in 3.10 for the line number when the tokenizer reports bad UTF8. (cherry picked from commit 8bc356a7dd50cbdb46d10b8c7e457832431f5d9e) Co-authored-by: Michael Droettboom --- Lib/test/test_source_encoding.py | 13 ++++- ...2-08-25-10-19-34.gh-issue-96268.AbYrLB.rst | 2 + Parser/tokenizer.c | 58 ++++++++++++++----- 3 files changed, 57 insertions(+), 16 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index d37914dcd74883..e357264eb1d165 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -248,8 +248,10 @@ def test_invalid_utf8(self): # test it is to write actual files to disk. # Each example is put inside a string at the top of the file so - # it's an otherwise valid Python source file. - template = b'"%s"\n' + # it's an otherwise valid Python source file. Put some newlines + # beforehand so we can assert that the error is reported on the + # correct line. + template = b'\n\n\n"%s"\n' fn = TESTFN self.addCleanup(unlink, fn) @@ -257,7 +259,12 @@ def test_invalid_utf8(self): def check(content): with open(fn, 'wb') as fp: fp.write(template % content) - script_helper.assert_python_failure(fn) + rc, stdout, stderr = script_helper.assert_python_failure(fn) + # We want to assert that the python subprocess failed gracefully, + # not via a signal. + self.assertGreaterEqual(rc, 1) + self.assertIn(b"Non-UTF-8 code starting with", stderr) + self.assertIn(b"on line 4", stderr) # continuation bytes in a sequence of 2, 3, or 4 bytes continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst new file mode 100644 index 00000000000000..987d85ff3bab8e --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst @@ -0,0 +1,2 @@ +Loading a file with invalid UTF-8 will now report the broken character at +the correct location. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index b5ebcd044f898a..8d9fbf5cf9596a 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -486,25 +486,59 @@ static void fp_ungetc(int c, struct tok_state *tok) { /* Check whether the characters at s start a valid UTF-8 sequence. Return the number of characters forming - the sequence if yes, 0 if not. */ -static int valid_utf8(const unsigned char* s) + the sequence if yes, 0 if not. The special cases match + those in stringlib/codecs.h:utf8_decode. +*/ +static int +valid_utf8(const unsigned char* s) { int expected = 0; int length; - if (*s < 0x80) + if (*s < 0x80) { /* single-byte code */ return 1; - if (*s < 0xc0) - /* following byte */ - return 0; - if (*s < 0xE0) + } + else if (*s < 0xE0) { + /* \xC2\x80-\xDF\xBF -- 0080-07FF */ + if (*s < 0xC2) { + /* invalid sequence + \x80-\xBF -- continuation byte + \xC0-\xC1 -- fake 0000-007F */ + return 0; + } expected = 1; - else if (*s < 0xF0) + } + else if (*s < 0xF0) { + /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ + if (*s == 0xE0 && *(s + 1) < 0xA0) { + /* invalid sequence + \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ + return 0; + } + else if (*s == 0xED && *(s + 1) >= 0xA0) { + /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF + will result in surrogates in range D800-DFFF. Surrogates are + not valid UTF-8 so they are rejected. + See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf + (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ + return 0; + } expected = 2; - else if (*s < 0xF8) + } + else if (*s < 0xF5) { + /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ + if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) { + /* invalid sequence -- one of: + \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF + \xF4\x90\x80\x80- -- 110000- overflow */ + return 0; + } expected = 3; - else + } + else { + /* invalid start byte */ return 0; + } length = expected + 1; for (; expected; expected--) if (s[expected] < 0x80 || s[expected] >= 0xC0) @@ -525,14 +559,12 @@ ensure_utf8(char *line, struct tok_state *tok) } } if (badchar) { - /* Need to add 1 to the line number, since this line - has not been counted, yet. */ PyErr_Format(PyExc_SyntaxError, "Non-UTF-8 code starting with '\\x%.2x' " "in file %U on line %i, " "but no encoding declared; " "see https://peps.python.org/pep-0263/ for details", - badchar, tok->filename, tok->lineno + 1); + badchar, tok->filename, tok->lineno); return 0; } return 1;