From f50e29656a52466fe61256bf7bd9acf9b650a50a Mon Sep 17 00:00:00 2001
From: Luca Casonato <hello@lcas.dev>
Date: Tue, 12 Apr 2022 02:08:35 +0200
Subject: [PATCH] Deserialize invalid UTF-8 into byte bufs as WTF-8

Previously #828 added support for deserializing lone leading and
trailing surrogates into WTF-8 encoded bytes when deserializing a string
as bytes. This commit extends this to cover the case of a leading
surrogate followed by code units that are not trailing surrogates. This
allows for deserialization of "\ud83c\ud83c" (two leading surrogates),
or  "\ud83c\u0061" (a leading surrogate followed by "a").

The docs also now make it clear that we are serializing the invalid code
points as WTF-8. This reference to WTF-8 signals to the user that they
can use a WTF-8 parser on the bytes to construct a valid UTF-8 string.
---
 src/de.rs     |  5 ++++-
 src/read.rs   | 41 +++++++++++++++++++++++++++-----------
 tests/test.rs | 54 +++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 80 insertions(+), 20 deletions(-)
diff --git a/src/de.rs b/src/de.rs
index ffd0d48c2..2b6daf46f 100644
--- a/src/de.rs
+++ b/src/de.rs
@@ -1570,7 +1570,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     ///
     /// The behavior of serde_json is specified to fail on non-UTF-8 strings
     /// when deserializing into Rust UTF-8 string types such as String, and
-    /// succeed with non-UTF-8 bytes when deserializing using this method.
+    /// succeed with the bytes representing the [WTF-8] encoding of code points
+    /// when deserializing using this method.
+    /// 
+    /// [WTF-8]: https://simonsapin.github.io/wtf-8
     ///
     /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
     /// still checked if the hex number represents a valid Unicode code point.
diff --git a/src/read.rs b/src/read.rs
index 1319d89c9..f3774e563 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -861,20 +861,33 @@ fn parse_escape<'de, R: Read<'de>>(
         b'r' => scratch.push(b'\r'),
         b't' => scratch.push(b'\t'),
         b'u' => {
-            fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
-                scratch.extend_from_slice(&[
-                    (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
-                    (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
-                    (n & 0b0011_1111) as u8 | 0b1000_0000,
-                ]);
+            fn encode_wtf8(scratch: &mut Vec<u8>, cp: u16) {
+                match cp {
+                    0x0000..=0x007F => {
+                        scratch.extend_from_slice(&[cp as u8]);
+                    }
+                    0x0080..=0x07FF => {
+                        scratch
+                            .extend_from_slice(&[0xC0 | (cp >> 6) as u8, 0x80 | (cp & 0x3F) as u8]);
+                    }
+                    0x0800..=0xFFFF => {
+                        scratch.extend_from_slice(&[
+                            0xE0 | (cp >> 12) as u8,
+                            0x80 | ((cp >> 6) & 0x3F) as u8,
+                            0x80 | (cp & 0x3F) as u8,
+                        ]);
+                    }
+                }
             }
 
             let c = match tri!(read.decode_hex_escape()) {
                 n @ 0xDC00..=0xDFFF => {
                     return if validate {
+                        // TODO: the error message is wrong, this is a lone
+                        // _trailing_ surrogate
                         error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
                     } else {
-                        encode_surrogate(scratch, n);
+                        encode_wtf8(scratch, n);
                         Ok(())
                     };
                 }
@@ -889,9 +902,9 @@ fn parse_escape<'de, R: Read<'de>>(
                     } else {
                         return if validate {
                             read.discard();
-                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+                            error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
                         } else {
-                            encode_surrogate(scratch, n1);
+                            encode_wtf8(scratch, n1);
                             Ok(())
                         };
                     }
@@ -903,7 +916,7 @@ fn parse_escape<'de, R: Read<'de>>(
                             read.discard();
                             error(read, ErrorCode::UnexpectedEndOfHexEscape)
                         } else {
-                            encode_surrogate(scratch, n1);
+                            encode_wtf8(scratch, n1);
                             // The \ prior to this byte started an escape sequence,
                             // so we need to parse that now. This recursive call
                             // does not blow the stack on malicious input because
@@ -916,7 +929,13 @@ fn parse_escape<'de, R: Read<'de>>(
                     let n2 = tri!(read.decode_hex_escape());
 
                     if n2 < 0xDC00 || n2 > 0xDFFF {
-                        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+                        return if validate {
+                            error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
+                        } else {
+                            encode_wtf8(scratch, n1);
+                            encode_wtf8(scratch, n2);
+                            Ok(())
+                        };
                     }
 
                     let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
diff --git a/tests/test.rs b/tests/test.rs
index b11635e75..7c5d5a78e 100644
--- a/tests/test.rs
+++ b/tests/test.rs
@@ -1713,7 +1713,8 @@ fn test_byte_buf_de() {
 }
 
 #[test]
-fn test_byte_buf_de_lone_surrogate() {
+fn test_byte_buf_de_invalid_surrogates() {
+    // lone leading surrogate
     let bytes = ByteBuf::from(vec![237, 160, 188]);
     let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
     assert_eq!(v, bytes);
@@ -1726,23 +1727,49 @@ fn test_byte_buf_de_lone_surrogate() {
     let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
     assert_eq!(v, bytes);
 
-    let bytes = ByteBuf::from(vec![237, 176, 129]);
-    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
-    assert_eq!(v, bytes);
-
     let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
     assert!(res.is_err());
 
     let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
     assert!(res.is_err());
 
-    let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
-    assert!(res.is_err());
+    // lone trailing surrogate
+    let bytes = ByteBuf::from(vec![237, 176, 129]);
+    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by other leading surrogate
+    let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
+    let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by "a" (U+0061) in \u encoding
+    let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+0080
+    let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+FFFF
+    let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
+    let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
+    assert_eq!(v, bytes);
+}
+
+#[test]
+fn test_byte_buf_de_surrogate_pair() {
+    // leading surrogate followed by trailing surrogate
+    let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
+    assert_eq!(v, bytes);
 }
 
 #[cfg(feature = "raw_value")]
 #[test]
-fn test_raw_de_lone_surrogate() {
+fn test_raw_de_invalid_surrogates() {
     use serde_json::value::RawValue;
 
     assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@@ -1752,6 +1779,17 @@ fn test_raw_de_lone_surrogate() {
     assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
+}
+
+#[cfg(feature = "raw_value")]
+#[test]
+fn test_raw_de_surrogate_pair() {
+    use serde_json::value::RawValue;
+
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
 }
 
 #[test]