From 683832a3434a04ee097e04cf4aed1eec5c9a0b67 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Wed, 24 Nov 2021 21:08:38 -0800 Subject: [PATCH 1/8] Touch up doc changes from PR 828 --- src/de.rs | 18 ++++++++---------- src/read.rs | 9 ++++----- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/de.rs b/src/de.rs index d9a5fee8c..a01c4767a 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1560,7 +1560,8 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { /// /// # Examples /// - /// You can use this to parse JSON strings containing invalid UTF-8 bytes. + /// You can use this to parse JSON strings containing invalid UTF-8 bytes, + /// or unpaired surrogates. /// /// ``` /// use serde_bytes::ByteBuf; @@ -1580,21 +1581,18 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { /// ``` /// /// Backslash escape sequences like `\n` are still interpreted and required - /// to be valid. `\u` escape sequences are required to represent valid - /// Unicode code points, except in the case of lone surrogates. + /// to be valid. `\u` escape sequences are required to represent a valid + /// Unicode code point or lone surrogate. /// /// ``` /// use serde_bytes::ByteBuf; /// - /// fn look_at_bytes() { + /// fn look_at_bytes() -> Result<(), serde_json::Error> { /// let json_data = b"\"lone surrogate: \\uD801\""; - /// let parsed: Result = serde_json::from_slice(json_data); - /// - /// assert!(parsed.is_ok()); - /// + /// let bytes: ByteBuf = serde_json::from_slice(json_data)?; /// let expected = b"lone surrogate: \xED\xA0\x81"; - /// let bytes: ByteBuf = parsed.unwrap(); - /// assert_eq!(expected, &bytes[..]); + /// assert_eq!(expected, bytes.as_slice()); + /// Ok(()) /// } /// # /// # look_at_bytes(); diff --git a/src/read.rs b/src/read.rs index 034cc6557..3d72880ea 100644 --- a/src/read.rs +++ b/src/read.rs @@ -875,11 +875,10 @@ fn parse_escape<'de, R: Read<'de>>( return Ok(()); } - // Non-BMP characters are encoded as a sequence of - // two hex escapes, representing UTF-16 surrogates. - // If `validate` is false and we only find a single - // hex escape that is a surrogate, then we'll accept - // it instead of erroring. + // Non-BMP characters are encoded as a sequence of two hex + // escapes, representing UTF-16 surrogates. If deserializing a + // utf-8 string the surrogates are required to be paired, + // whereas deserializing a byte string accepts lone surrogates. n1 @ 0xD800..=0xDBFF => { if tri!(peek_or_eof(read)) != b'\\' { if validate { From 48dad22b3f2cfc1c4c8dd19976cbe088da932eb4 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Wed, 24 Nov 2021 21:10:13 -0800 Subject: [PATCH 2/8] Collapse surrogate encode into extend_from_slice call --- src/read.rs | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/read.rs b/src/read.rs index 3d72880ea..035c865d3 100644 --- a/src/read.rs +++ b/src/read.rs @@ -864,13 +864,11 @@ fn parse_escape<'de, R: Read<'de>>( return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); } - let utf8_bytes = [ + scratch.extend_from_slice(&[ (n >> 12 & 0x0F) as u8 | 0b1110_0000, (n >> 6 & 0x3F) as u8 | 0b1000_0000, (n & 0x3F) as u8 | 0b1000_0000, - ]; - - scratch.extend_from_slice(&utf8_bytes); + ]); return Ok(()); } @@ -886,13 +884,11 @@ fn parse_escape<'de, R: Read<'de>>( return error(read, ErrorCode::UnexpectedEndOfHexEscape); } - let utf8_bytes = [ + scratch.extend_from_slice(&[ (n1 >> 12 & 0x0F) as u8 | 0b1110_0000, (n1 >> 6 & 0x3F) as u8 | 0b1000_0000, (n1 & 0x3F) as u8 | 0b1000_0000, - ]; - - scratch.extend_from_slice(&utf8_bytes); + ]); return Ok(()); } @@ -904,13 +900,11 @@ fn parse_escape<'de, R: Read<'de>>( return error(read, ErrorCode::UnexpectedEndOfHexEscape); } - let utf8_bytes = [ + scratch.extend_from_slice(&[ (n1 >> 12 & 0x0F) as u8 | 0b1110_0000, (n1 >> 6 & 0x3F) as u8 | 0b1000_0000, (n1 & 0x3F) as u8 | 0b1000_0000, - ]; - - scratch.extend_from_slice(&utf8_bytes); + ]); // The \ prior to this byte started an escape sequence, // so we need to parse that now. From 7911e704a0fb762ddda89951f01f02457074ca02 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Wed, 24 Nov 2021 21:10:49 -0800 Subject: [PATCH 3/8] Tail recurse on parse_escape --- src/read.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/read.rs b/src/read.rs index 035c865d3..f2396b81c 100644 --- a/src/read.rs +++ b/src/read.rs @@ -908,9 +908,7 @@ fn parse_escape<'de, R: Read<'de>>( // The \ prior to this byte started an escape sequence, // so we need to parse that now. - parse_escape(read, validate, scratch)?; - - return Ok(()); + return parse_escape(read, validate, scratch); } read.discard(); From cb4a2517b2a8b20449fe0923a9fb9133bacc1b4e Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Wed, 24 Nov 2021 21:12:52 -0800 Subject: [PATCH 4/8] Document why the parse_escape recursion is not dangerous --- src/read.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/read.rs b/src/read.rs index f2396b81c..6cb6780f8 100644 --- a/src/read.rs +++ b/src/read.rs @@ -907,7 +907,10 @@ fn parse_escape<'de, R: Read<'de>>( ]); // The \ prior to this byte started an escape sequence, - // so we need to parse that now. + // so we need to parse that now. This recursive call + // does not blow the stack on malicious input because + // the escape is not \u, so it will be handled by one + // of the easy nonrecursive cases. return parse_escape(read, validate, scratch); } read.discard(); From 11d3464f1cd9e48aabb3630773a03da845838c7a Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Wed, 24 Nov 2021 21:14:03 -0800 Subject: [PATCH 5/8] Extract common logic of surrogate encode --- src/read.rs | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/read.rs b/src/read.rs index 6cb6780f8..321f16cd4 100644 --- a/src/read.rs +++ b/src/read.rs @@ -858,17 +858,21 @@ fn parse_escape<'de, R: Read<'de>>( b'r' => scratch.push(b'\r'), b't' => scratch.push(b'\t'), b'u' => { + fn encode_surrogate(scratch: &mut Vec, n: u16) { + scratch.extend_from_slice(&[ + (n >> 12 & 0x0F) as u8 | 0b1110_0000, + (n >> 6 & 0x3F) as u8 | 0b1000_0000, + (n & 0x3F) as u8 | 0b1000_0000, + ]); + } + let c = match tri!(read.decode_hex_escape()) { n @ 0xDC00..=0xDFFF => { if validate { return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); } - scratch.extend_from_slice(&[ - (n >> 12 & 0x0F) as u8 | 0b1110_0000, - (n >> 6 & 0x3F) as u8 | 0b1000_0000, - (n & 0x3F) as u8 | 0b1000_0000, - ]); + encode_surrogate(scratch, n); return Ok(()); } @@ -884,11 +888,7 @@ fn parse_escape<'de, R: Read<'de>>( return error(read, ErrorCode::UnexpectedEndOfHexEscape); } - scratch.extend_from_slice(&[ - (n1 >> 12 & 0x0F) as u8 | 0b1110_0000, - (n1 >> 6 & 0x3F) as u8 | 0b1000_0000, - (n1 & 0x3F) as u8 | 0b1000_0000, - ]); + encode_surrogate(scratch, n1); return Ok(()); } @@ -900,11 +900,7 @@ fn parse_escape<'de, R: Read<'de>>( return error(read, ErrorCode::UnexpectedEndOfHexEscape); } - scratch.extend_from_slice(&[ - (n1 >> 12 & 0x0F) as u8 | 0b1110_0000, - (n1 >> 6 & 0x3F) as u8 | 0b1000_0000, - (n1 & 0x3F) as u8 | 0b1000_0000, - ]); + encode_surrogate(scratch, n1); // The \ prior to this byte started an escape sequence, // so we need to parse that now. This recursive call From 311f185d8e5d3c37ed284c692d0b2e493c38fb33 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Wed, 24 Nov 2021 21:15:24 -0800 Subject: [PATCH 6/8] Use binary mask to line up visually with the bits being |'d in --- src/read.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/read.rs b/src/read.rs index 321f16cd4..50111b605 100644 --- a/src/read.rs +++ b/src/read.rs @@ -860,9 +860,9 @@ fn parse_escape<'de, R: Read<'de>>( b'u' => { fn encode_surrogate(scratch: &mut Vec, n: u16) { scratch.extend_from_slice(&[ - (n >> 12 & 0x0F) as u8 | 0b1110_0000, - (n >> 6 & 0x3F) as u8 | 0b1000_0000, - (n & 0x3F) as u8 | 0b1000_0000, + (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000, + (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000, + (n & 0b0011_1111) as u8 | 0b1000_0000, ]); } From 142207623acd222a650511df39ffbcc75ad87a42 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Wed, 24 Nov 2021 21:16:58 -0800 Subject: [PATCH 7/8] Rearrange the early return on lone or encoded surrogate --- src/read.rs | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/read.rs b/src/read.rs index 50111b605..4b355fd43 100644 --- a/src/read.rs +++ b/src/read.rs @@ -868,13 +868,12 @@ fn parse_escape<'de, R: Read<'de>>( let c = match tri!(read.decode_hex_escape()) { n @ 0xDC00..=0xDFFF => { - if validate { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } - - encode_surrogate(scratch, n); - - return Ok(()); + return if validate { + error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) + } else { + encode_surrogate(scratch, n); + Ok(()) + }; } // Non-BMP characters are encoded as a sequence of two hex @@ -883,31 +882,29 @@ fn parse_escape<'de, R: Read<'de>>( // whereas deserializing a byte string accepts lone surrogates. n1 @ 0xD800..=0xDBFF => { if tri!(peek_or_eof(read)) != b'\\' { - if validate { + return if validate { read.discard(); - return error(read, ErrorCode::UnexpectedEndOfHexEscape); - } - - encode_surrogate(scratch, n1); - - return Ok(()); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + encode_surrogate(scratch, n1); + Ok(()) + }; } read.discard(); if tri!(peek_or_eof(read)) != b'u' { - if validate { + return if validate { read.discard(); - return error(read, ErrorCode::UnexpectedEndOfHexEscape); - } - - encode_surrogate(scratch, n1); - - // The \ prior to this byte started an escape sequence, - // so we need to parse that now. This recursive call - // does not blow the stack on malicious input because - // the escape is not \u, so it will be handled by one - // of the easy nonrecursive cases. - return parse_escape(read, validate, scratch); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + encode_surrogate(scratch, n1); + // The \ prior to this byte started an escape sequence, + // so we need to parse that now. This recursive call + // does not blow the stack on malicious input because + // the escape is not \u, so it will be handled by one + // of the easy nonrecursive cases. + parse_escape(read, validate, scratch) + }; } read.discard(); From 265fb7ee40d8ef25cb05acc9dc4735621cb1341e Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Wed, 24 Nov 2021 21:17:43 -0800 Subject: [PATCH 8/8] Move discard of expected byte immediately after peek --- src/read.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/read.rs b/src/read.rs index 4b355fd43..2621f3f3f 100644 --- a/src/read.rs +++ b/src/read.rs @@ -881,7 +881,9 @@ fn parse_escape<'de, R: Read<'de>>( // utf-8 string the surrogates are required to be paired, // whereas deserializing a byte string accepts lone surrogates. n1 @ 0xD800..=0xDBFF => { - if tri!(peek_or_eof(read)) != b'\\' { + if tri!(peek_or_eof(read)) == b'\\' { + read.discard(); + } else { return if validate { read.discard(); error(read, ErrorCode::UnexpectedEndOfHexEscape) @@ -890,9 +892,10 @@ fn parse_escape<'de, R: Read<'de>>( Ok(()) }; } - read.discard(); - if tri!(peek_or_eof(read)) != b'u' { + if tri!(peek_or_eof(read)) == b'u' { + read.discard(); + } else { return if validate { read.discard(); error(read, ErrorCode::UnexpectedEndOfHexEscape) @@ -906,7 +909,6 @@ fn parse_escape<'de, R: Read<'de>>( parse_escape(read, validate, scratch) }; } - read.discard(); let n2 = tri!(read.decode_hex_escape());