Skip to content

Commit

Permalink
Deserialize invalid UTF-8 into byte bufs as WTF-8
Browse files Browse the repository at this point in the history
Previously serde-rs#828 added support for deserializing lone leading and
trailing surrogates into WTF-8 encoded bytes when deserializing a string
as bytes. This commit extends this to cover the case of a leading
surrogate followed by code units that are not trailing surrogates. This
allows for deserialization of "\ud83c\ud83c" (two leading surrogates),
or "\ud83c\u0061" (a leading surrogate followed by "a").

The docs also now make it clear that we are encoding the invalid code
points as WTF-8. This reference to WTF-8 signals to the user that they
can use a WTF-8 parser on the bytes to construct a valid UTF-8 string.
  • Loading branch information
lucacasonato committed Apr 12, 2022
1 parent 829175e commit c3bfa51
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 20 deletions.
5 changes: 4 additions & 1 deletion src/de.rs
Expand Up @@ -1570,7 +1570,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
///
/// The behavior of serde_json is specified to fail on non-UTF-8 strings
/// when deserializing into Rust UTF-8 string types such as String, and
/// succeed with non-UTF-8 bytes when deserializing using this method.
/// succeed with the bytes representing the [WTF-8] encoding of code points
/// when deserializing using this method.
///
/// [WTF-8]: https://simonsapin.github.io/wtf-8
///
/// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
/// still checked if the hex number represents a valid Unicode code point.
Expand Down
41 changes: 30 additions & 11 deletions src/read.rs
Expand Up @@ -861,20 +861,33 @@ fn parse_escape<'de, R: Read<'de>>(
b'r' => scratch.push(b'\r'),
b't' => scratch.push(b'\t'),
b'u' => {
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
scratch.extend_from_slice(&[
(n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
(n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
(n & 0b0011_1111) as u8 | 0b1000_0000,
]);
/// Appends the WTF-8 encoding of the 16-bit code point `cp` to `scratch`.
///
/// The byte layout is the same as UTF-8 for one-, two- and three-byte
/// sequences, except that no value is rejected: surrogate code points
/// (`0xD800..=0xDFFF`) are written out as ordinary three-byte sequences.
/// Existing contents of `scratch` are left untouched; bytes are only appended.
fn encode_wtf8(scratch: &mut Vec<u8>, cp: u16) {
    // Mask selecting the six payload bits carried by a continuation byte.
    const PAYLOAD: u16 = 0x3F;

    // Every multi-byte form ends with the low six bits in a `10xxxxxx` byte.
    let tail = 0x80 | (cp & PAYLOAD) as u8;

    if cp < 0x0080 {
        // Single byte: the code point itself.
        scratch.push(cp as u8);
    } else if cp < 0x0800 {
        // Two bytes: `110xxxxx 10xxxxxx`.
        scratch.push(0xC0 | (cp >> 6) as u8);
        scratch.push(tail);
    } else {
        // Three bytes: `1110xxxx 10xxxxxx 10xxxxxx`.
        scratch.push(0xE0 | (cp >> 12) as u8);
        scratch.push(0x80 | ((cp >> 6) & PAYLOAD) as u8);
        scratch.push(tail);
    }
}

let c = match tri!(read.decode_hex_escape()) {
n @ 0xDC00..=0xDFFF => {
return if validate {
// TODO: the error message is wrong, this is a lone
// _trailing_ surrogate
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
} else {
encode_surrogate(scratch, n);
encode_wtf8(scratch, n);
Ok(())
};
}
Expand All @@ -889,9 +902,9 @@ fn parse_escape<'de, R: Read<'de>>(
} else {
return if validate {
read.discard();
error(read, ErrorCode::UnexpectedEndOfHexEscape)
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
} else {
encode_surrogate(scratch, n1);
encode_wtf8(scratch, n1);
Ok(())
};
}
Expand All @@ -903,7 +916,7 @@ fn parse_escape<'de, R: Read<'de>>(
read.discard();
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
encode_surrogate(scratch, n1);
encode_wtf8(scratch, n1);
// The \ prior to this byte started an escape sequence,
// so we need to parse that now. This recursive call
// does not blow the stack on malicious input because
Expand All @@ -916,7 +929,13 @@ fn parse_escape<'de, R: Read<'de>>(
let n2 = tri!(read.decode_hex_escape());

if n2 < 0xDC00 || n2 > 0xDFFF {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
return if validate {
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
} else {
encode_wtf8(scratch, n1);
encode_wtf8(scratch, n2);
Ok(())
};
}

let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
Expand Down
54 changes: 46 additions & 8 deletions tests/test.rs
Expand Up @@ -1713,7 +1713,8 @@ fn test_byte_buf_de() {
}

#[test]
fn test_byte_buf_de_lone_surrogate() {
fn test_byte_buf_de_invalid_surrogates() {
// lone leading surrogate
let bytes = ByteBuf::from(vec![237, 160, 188]);
let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
assert_eq!(v, bytes);
Expand All @@ -1726,23 +1727,49 @@ fn test_byte_buf_de_lone_surrogate() {
let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
assert_eq!(v, bytes);

let bytes = ByteBuf::from(vec![237, 176, 129]);
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
assert_eq!(v, bytes);

let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
assert!(res.is_err());

let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
assert!(res.is_err());

let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
assert!(res.is_err());
// lone trailing surrogate
let bytes = ByteBuf::from(vec![237, 176, 129]);
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
assert_eq!(v, bytes);

// leading surrogate followed by other leading surrogate
let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
assert_eq!(v, bytes);

// leading surrogate followed by "a" (U+0061) in \u encoding
let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
assert_eq!(v, bytes);

// leading surrogate followed by U+0080
let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
assert_eq!(v, bytes);

// leading surrogate followed by U+FFFF
let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
assert_eq!(v, bytes);
}

#[test]
fn test_byte_buf_de_surrogate_pair() {
    // A leading surrogate immediately followed by a trailing surrogate is a
    // well-formed pair, so it must decode to the ordinary four-byte UTF-8
    // encoding of the combined code point (U+1F000 here).
    let expected = ByteBuf::from(vec![240, 159, 128, 128]);
    let decoded: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
    assert_eq!(decoded, expected);
}

#[cfg(feature = "raw_value")]
#[test]
fn test_raw_de_lone_surrogate() {
fn test_raw_de_invalid_surrogates() {
use serde_json::value::RawValue;

assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
Expand All @@ -1752,6 +1779,17 @@ fn test_raw_de_lone_surrogate() {
assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
}

// Named `test_raw_de_surrogate_pair` rather than reusing
// `test_byte_buf_de_surrogate_pair`: that name is already taken by the
// `ByteBuf` test in this file, and a second definition makes the test crate
// fail to compile whenever the `raw_value` feature is enabled.
#[cfg(feature = "raw_value")]
#[test]
fn test_raw_de_surrogate_pair() {
    use serde_json::value::RawValue;

    // A well-formed surrogate pair must be accepted when the string is
    // captured as a raw JSON value.
    assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
}

#[test]
Expand Down

0 comments on commit c3bfa51

Please sign in to comment.