Merge pull request #829 from serde-rs/surrogate

Touch up PR 828
serde-rs · Nov 25, 2021 · 77915eb · 77915eb
2 parents 691466c + 265fb7e
commit 77915eb
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 62 deletions.
diff --git a/src/de.rs b/src/de.rs
@@ -1560,7 +1560,8 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     ///
     /// # Examples
     ///
-    /// You can use this to parse JSON strings containing invalid UTF-8 bytes.
+    /// You can use this to parse JSON strings containing invalid UTF-8 bytes,
+    /// or unpaired surrogates.
     ///
     /// ```
     /// use serde_bytes::ByteBuf;
@@ -1580,21 +1581,18 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     /// ```
     ///
     /// Backslash escape sequences like `\n` are still interpreted and required
-    /// to be valid. `\u` escape sequences are required to represent valid
-    /// Unicode code points, except in the case of lone surrogates.
+    /// to be valid. `\u` escape sequences are required to represent a valid
+    /// Unicode code point or lone surrogate.
     ///
     /// ```
     /// use serde_bytes::ByteBuf;
     ///
-    /// fn look_at_bytes() {
+    /// fn look_at_bytes() -> Result<(), serde_json::Error> {
     ///     let json_data = b"\"lone surrogate: \\uD801\"";
-    ///     let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
-    ///
-    ///     assert!(parsed.is_ok());
-    ///
+    ///     let bytes: ByteBuf = serde_json::from_slice(json_data)?;
     ///     let expected = b"lone surrogate: \xED\xA0\x81";
-    ///     let bytes: ByteBuf = parsed.unwrap();
-    ///     assert_eq!(expected, &bytes[..]);
+    ///     assert_eq!(expected, bytes.as_slice());
+    ///     Ok(())
     /// }
     /// #
     /// # look_at_bytes();

diff --git a/src/read.rs b/src/read.rs
@@ -858,68 +858,57 @@ fn parse_escape<'de, R: Read<'de>>(
         b'r' => scratch.push(b'\r'),
         b't' => scratch.push(b'\t'),
         b'u' => {
+            fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
+                scratch.extend_from_slice(&[
+                    (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
+                    (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
+                    (n & 0b0011_1111) as u8 | 0b1000_0000,
+                ]);
+            }
+
             let c = match tri!(read.decode_hex_escape()) {
                 n @ 0xDC00..=0xDFFF => {
-                    if validate {
-                        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-                    }
-
-                    let utf8_bytes = [
-                        (n >> 12 & 0x0F) as u8 | 0b1110_0000,
-                        (n >> 6 & 0x3F) as u8 | 0b1000_0000,
-                        (n & 0x3F) as u8 | 0b1000_0000,
-                    ];
-
-                    scratch.extend_from_slice(&utf8_bytes);
-
-                    return Ok(());
+                    return if validate {
+                        error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
+                    } else {
+                        encode_surrogate(scratch, n);
+                        Ok(())
+                    };
                 }
 
-                // Non-BMP characters are encoded as a sequence of
-                // two hex escapes, representing UTF-16 surrogates.
-                // If `validate` is false and we only find a single
-                // hex escape that is a surrogate, then we'll accept
-                // it instead of erroring.
+                // Non-BMP characters are encoded as a sequence of two hex
+                // escapes, representing UTF-16 surrogates. If deserializing a
+                // UTF-8 string, the surrogates are required to be paired,
+                // whereas deserializing a byte string accepts lone surrogates.
                 n1 @ 0xD800..=0xDBFF => {
-                    if tri!(peek_or_eof(read)) != b'\\' {
-                        if validate {
+                    if tri!(peek_or_eof(read)) == b'\\' {
+                        read.discard();
+                    } else {
+                        return if validate {
                             read.discard();
-                            return error(read, ErrorCode::UnexpectedEndOfHexEscape);
-                        }
-
-                        let utf8_bytes = [
-                            (n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
-                            (n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
-                            (n1 & 0x3F) as u8 | 0b1000_0000,
-                        ];
-
-                        scratch.extend_from_slice(&utf8_bytes);
-
-                        return Ok(());
+                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+                        } else {
+                            encode_surrogate(scratch, n1);
+                            Ok(())
+                        };
                     }
-                    read.discard();
 
-                    if tri!(peek_or_eof(read)) != b'u' {
-                        if validate {
+                    if tri!(peek_or_eof(read)) == b'u' {
+                        read.discard();
+                    } else {
+                        return if validate {
                             read.discard();
-                            return error(read, ErrorCode::UnexpectedEndOfHexEscape);
-                        }
-
-                        let utf8_bytes = [
-                            (n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
-                            (n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
-                            (n1 & 0x3F) as u8 | 0b1000_0000,
-                        ];
-
-                        scratch.extend_from_slice(&utf8_bytes);
-
-                        // The \ prior to this byte started an escape sequence,
-                        // so we need to parse that now.
-                        parse_escape(read, validate, scratch)?;
-
-                        return Ok(());
+                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+                        } else {
+                            encode_surrogate(scratch, n1);
+                            // The \ prior to this byte started an escape sequence,
+                            // so we need to parse that now. This recursive call
+                            // does not blow the stack on malicious input because
+                            // the escape is known not to be \u here, so it will be
+                            // handled by one of the easy nonrecursive cases.
+                            parse_escape(read, validate, scratch)
+                        };
                     }
-                    read.discard();
 
                     let n2 = tri!(read.decode_hex_escape());