From 683832a3434a04ee097e04cf4aed1eec5c9a0b67 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Wed, 24 Nov 2021 21:08:38 -0800
Subject: [PATCH 1/8] Touch up doc changes from PR 828

---
 src/de.rs   | 18 ++++++++----------
 src/read.rs |  9 ++++-----
 2 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/src/de.rs b/src/de.rs
index d9a5fee8c..a01c4767a 100644
--- a/src/de.rs
+++ b/src/de.rs
@@ -1560,7 +1560,8 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     ///
     /// # Examples
     ///
-    /// You can use this to parse JSON strings containing invalid UTF-8 bytes.
+    /// You can use this to parse JSON strings containing invalid UTF-8 bytes,
+    /// or unpaired surrogates.
     ///
     /// ```
     /// use serde_bytes::ByteBuf;
@@ -1580,21 +1581,18 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     /// ```
     ///
     /// Backslash escape sequences like `\n` are still interpreted and required
-    /// to be valid. `\u` escape sequences are required to represent valid
-    /// Unicode code points, except in the case of lone surrogates.
+    /// to be valid. `\u` escape sequences are required to represent a valid
+    /// Unicode code point or lone surrogate.
     ///
     /// ```
     /// use serde_bytes::ByteBuf;
     ///
-    /// fn look_at_bytes() {
+    /// fn look_at_bytes() -> Result<(), serde_json::Error> {
     ///     let json_data = b"\"lone surrogate: \\uD801\"";
-    ///     let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
-    ///
-    ///     assert!(parsed.is_ok());
-    ///
+    ///     let bytes: ByteBuf = serde_json::from_slice(json_data)?;
     ///     let expected = b"lone surrogate: \xED\xA0\x81";
-    ///     let bytes: ByteBuf = parsed.unwrap();
-    ///     assert_eq!(expected, &bytes[..]);
+    ///     assert_eq!(expected, bytes.as_slice());
+    ///     Ok(())
     /// }
     /// #
     /// # look_at_bytes();
diff --git a/src/read.rs b/src/read.rs
index 034cc6557..3d72880ea 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -875,11 +875,10 @@ fn parse_escape<'de, R: Read<'de>>(
                     return Ok(());
                 }
 
-                // Non-BMP characters are encoded as a sequence of
-                // two hex escapes, representing UTF-16 surrogates.
-                // If `validate` is false and we only find a single
-                // hex escape that is a surrogate, then we'll accept
-                // it instead of erroring.
+                // Non-BMP characters are encoded as a sequence of two hex
+                // escapes, representing UTF-16 surrogates. If deserializing a
+                // utf-8 string the surrogates are required to be paired,
+                // whereas deserializing a byte string accepts lone surrogates.
                 n1 @ 0xD800..=0xDBFF => {
                     if tri!(peek_or_eof(read)) != b'\\' {
                         if validate {

From 48dad22b3f2cfc1c4c8dd19976cbe088da932eb4 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Wed, 24 Nov 2021 21:10:13 -0800
Subject: [PATCH 2/8] Collapse surrogate encode into extend_from_slice call

---
 src/read.rs | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 3d72880ea..035c865d3 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -864,13 +864,11 @@ fn parse_escape<'de, R: Read<'de>>(
                         return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
                     }
 
-                    let utf8_bytes = [
+                    scratch.extend_from_slice(&[
                         (n >> 12 & 0x0F) as u8 | 0b1110_0000,
                         (n >> 6 & 0x3F) as u8 | 0b1000_0000,
                         (n & 0x3F) as u8 | 0b1000_0000,
-                    ];
-
-                    scratch.extend_from_slice(&utf8_bytes);
+                    ]);
 
                     return Ok(());
                 }
@@ -886,13 +884,11 @@ fn parse_escape<'de, R: Read<'de>>(
                             return error(read, ErrorCode::UnexpectedEndOfHexEscape);
                         }
 
-                        let utf8_bytes = [
+                        scratch.extend_from_slice(&[
                             (n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
                             (n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
                             (n1 & 0x3F) as u8 | 0b1000_0000,
-                        ];
-
-                        scratch.extend_from_slice(&utf8_bytes);
+                        ]);
 
                         return Ok(());
                     }
@@ -904,13 +900,11 @@ fn parse_escape<'de, R: Read<'de>>(
                             return error(read, ErrorCode::UnexpectedEndOfHexEscape);
                         }
 
-                        let utf8_bytes = [
+                        scratch.extend_from_slice(&[
                             (n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
                             (n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
                             (n1 & 0x3F) as u8 | 0b1000_0000,
-                        ];
-
-                        scratch.extend_from_slice(&utf8_bytes);
+                        ]);
 
                         // The \ prior to this byte started an escape sequence,
                         // so we need to parse that now.

From 7911e704a0fb762ddda89951f01f02457074ca02 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Wed, 24 Nov 2021 21:10:49 -0800
Subject: [PATCH 3/8] Tail recurse on parse_escape

---
 src/read.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 035c865d3..f2396b81c 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -908,9 +908,7 @@ fn parse_escape<'de, R: Read<'de>>(
 
                         // The \ prior to this byte started an escape sequence,
                         // so we need to parse that now.
-                        parse_escape(read, validate, scratch)?;
-
-                        return Ok(());
+                        return parse_escape(read, validate, scratch);
                     }
                     read.discard();
 

From cb4a2517b2a8b20449fe0923a9fb9133bacc1b4e Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Wed, 24 Nov 2021 21:12:52 -0800
Subject: [PATCH 4/8] Document why the parse_escape recursion is not dangerous

---
 src/read.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/read.rs b/src/read.rs
index f2396b81c..6cb6780f8 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -907,7 +907,10 @@ fn parse_escape<'de, R: Read<'de>>(
                         ]);
 
                         // The \ prior to this byte started an escape sequence,
-                        // so we need to parse that now.
+                        // so we need to parse that now. This recursive call
+                        // does not blow the stack on malicious input because
+                        // the escape is not \u, so it will be handled by one
+                        // of the easy nonrecursive cases.
                         return parse_escape(read, validate, scratch);
                     }
                     read.discard();

From 11d3464f1cd9e48aabb3630773a03da845838c7a Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Wed, 24 Nov 2021 21:14:03 -0800
Subject: [PATCH 5/8] Extract common logic of surrogate encode

---
 src/read.rs | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 6cb6780f8..321f16cd4 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -858,17 +858,21 @@ fn parse_escape<'de, R: Read<'de>>(
         b'r' => scratch.push(b'\r'),
         b't' => scratch.push(b'\t'),
         b'u' => {
+            fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
+                scratch.extend_from_slice(&[
+                    (n >> 12 & 0x0F) as u8 | 0b1110_0000,
+                    (n >> 6 & 0x3F) as u8 | 0b1000_0000,
+                    (n & 0x3F) as u8 | 0b1000_0000,
+                ]);
+            }
+
             let c = match tri!(read.decode_hex_escape()) {
                 n @ 0xDC00..=0xDFFF => {
                     if validate {
                         return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
                     }
 
-                    scratch.extend_from_slice(&[
-                        (n >> 12 & 0x0F) as u8 | 0b1110_0000,
-                        (n >> 6 & 0x3F) as u8 | 0b1000_0000,
-                        (n & 0x3F) as u8 | 0b1000_0000,
-                    ]);
+                    encode_surrogate(scratch, n);
 
                     return Ok(());
                 }
@@ -884,11 +888,7 @@ fn parse_escape<'de, R: Read<'de>>(
                             return error(read, ErrorCode::UnexpectedEndOfHexEscape);
                         }
 
-                        scratch.extend_from_slice(&[
-                            (n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
-                            (n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
-                            (n1 & 0x3F) as u8 | 0b1000_0000,
-                        ]);
+                        encode_surrogate(scratch, n1);
 
                         return Ok(());
                     }
@@ -900,11 +900,7 @@ fn parse_escape<'de, R: Read<'de>>(
                             return error(read, ErrorCode::UnexpectedEndOfHexEscape);
                         }
 
-                        scratch.extend_from_slice(&[
-                            (n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
-                            (n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
-                            (n1 & 0x3F) as u8 | 0b1000_0000,
-                        ]);
+                        encode_surrogate(scratch, n1);
 
                         // The \ prior to this byte started an escape sequence,
                         // so we need to parse that now. This recursive call

From 311f185d8e5d3c37ed284c692d0b2e493c38fb33 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Wed, 24 Nov 2021 21:15:24 -0800
Subject: [PATCH 6/8] Use binary mask to line up visually with the bits being
 |'d in

---
 src/read.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 321f16cd4..50111b605 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -860,9 +860,9 @@ fn parse_escape<'de, R: Read<'de>>(
         b'u' => {
             fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
                 scratch.extend_from_slice(&[
-                    (n >> 12 & 0x0F) as u8 | 0b1110_0000,
-                    (n >> 6 & 0x3F) as u8 | 0b1000_0000,
-                    (n & 0x3F) as u8 | 0b1000_0000,
+                    (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
+                    (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
+                    (n & 0b0011_1111) as u8 | 0b1000_0000,
                 ]);
             }
 

From 142207623acd222a650511df39ffbcc75ad87a42 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Wed, 24 Nov 2021 21:16:58 -0800
Subject: [PATCH 7/8] Rearrange the early return on lone or encoded surrogate

---
 src/read.rs | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 50111b605..4b355fd43 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -868,13 +868,12 @@ fn parse_escape<'de, R: Read<'de>>(
 
             let c = match tri!(read.decode_hex_escape()) {
                 n @ 0xDC00..=0xDFFF => {
-                    if validate {
-                        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-                    }
-
-                    encode_surrogate(scratch, n);
-
-                    return Ok(());
+                    return if validate {
+                        error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
+                    } else {
+                        encode_surrogate(scratch, n);
+                        Ok(())
+                    };
                 }
 
                 // Non-BMP characters are encoded as a sequence of two hex
@@ -883,31 +882,29 @@ fn parse_escape<'de, R: Read<'de>>(
                 // whereas deserializing a byte string accepts lone surrogates.
                 n1 @ 0xD800..=0xDBFF => {
                     if tri!(peek_or_eof(read)) != b'\\' {
-                        if validate {
+                        return if validate {
                             read.discard();
-                            return error(read, ErrorCode::UnexpectedEndOfHexEscape);
-                        }
-
-                        encode_surrogate(scratch, n1);
-
-                        return Ok(());
+                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+                        } else {
+                            encode_surrogate(scratch, n1);
+                            Ok(())
+                        };
                     }
                     read.discard();
 
                     if tri!(peek_or_eof(read)) != b'u' {
-                        if validate {
+                        return if validate {
                             read.discard();
-                            return error(read, ErrorCode::UnexpectedEndOfHexEscape);
-                        }
-
-                        encode_surrogate(scratch, n1);
-
-                        // The \ prior to this byte started an escape sequence,
-                        // so we need to parse that now. This recursive call
-                        // does not blow the stack on malicious input because
-                        // the escape is not \u, so it will be handled by one
-                        // of the easy nonrecursive cases.
-                        return parse_escape(read, validate, scratch);
+                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+                        } else {
+                            encode_surrogate(scratch, n1);
+                            // The \ prior to this byte started an escape sequence,
+                            // so we need to parse that now. This recursive call
+                            // does not blow the stack on malicious input because
+                            // the escape is not \u, so it will be handled by one
+                            // of the easy nonrecursive cases.
+                            parse_escape(read, validate, scratch)
+                        };
                     }
                     read.discard();
 

From 265fb7ee40d8ef25cb05acc9dc4735621cb1341e Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Wed, 24 Nov 2021 21:17:43 -0800
Subject: [PATCH 8/8] Move discard of expected byte immediately after peek

---
 src/read.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 4b355fd43..2621f3f3f 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -881,7 +881,9 @@ fn parse_escape<'de, R: Read<'de>>(
                 // utf-8 string the surrogates are required to be paired,
                 // whereas deserializing a byte string accepts lone surrogates.
                 n1 @ 0xD800..=0xDBFF => {
-                    if tri!(peek_or_eof(read)) != b'\\' {
+                    if tri!(peek_or_eof(read)) == b'\\' {
+                        read.discard();
+                    } else {
                         return if validate {
                             read.discard();
                             error(read, ErrorCode::UnexpectedEndOfHexEscape)
@@ -890,9 +892,10 @@ fn parse_escape<'de, R: Read<'de>>(
                             Ok(())
                         };
                     }
-                    read.discard();
 
-                    if tri!(peek_or_eof(read)) != b'u' {
+                    if tri!(peek_or_eof(read)) == b'u' {
+                        read.discard();
+                    } else {
                         return if validate {
                             read.discard();
                             error(read, ErrorCode::UnexpectedEndOfHexEscape)
@@ -906,7 +909,6 @@ fn parse_escape<'de, R: Read<'de>>(
                             parse_escape(read, validate, scratch)
                         };
                     }
-                    read.discard();
 
                     let n2 = tri!(read.decode_hex_escape());