Skip to content

Commit

Permalink
Merge pull request #829 from serde-rs/surrogate
Browse files Browse the repository at this point in the history
Touch up PR 828
  • Loading branch information
dtolnay committed Nov 25, 2021
2 parents 691466c + 265fb7e commit 77915eb
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 62 deletions.
18 changes: 8 additions & 10 deletions src/de.rs
Expand Up @@ -1560,7 +1560,8 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
///
/// # Examples
///
/// You can use this to parse JSON strings containing invalid UTF-8 bytes.
/// You can use this to parse JSON strings containing invalid UTF-8 bytes,
/// or unpaired surrogates.
///
/// ```
/// use serde_bytes::ByteBuf;
Expand All @@ -1580,21 +1581,18 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
/// ```
///
/// Backslash escape sequences like `\n` are still interpreted and required
/// to be valid. `\u` escape sequences are required to represent valid
/// Unicode code points, except in the case of lone surrogates.
/// to be valid. `\u` escape sequences are required to represent a valid
/// Unicode code point or lone surrogate.
///
/// ```
/// use serde_bytes::ByteBuf;
///
/// fn look_at_bytes() {
/// fn look_at_bytes() -> Result<(), serde_json::Error> {
/// let json_data = b"\"lone surrogate: \\uD801\"";
/// let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
///
/// assert!(parsed.is_ok());
///
/// let bytes: ByteBuf = serde_json::from_slice(json_data)?;
/// let expected = b"lone surrogate: \xED\xA0\x81";
/// let bytes: ByteBuf = parsed.unwrap();
/// assert_eq!(expected, &bytes[..]);
/// assert_eq!(expected, bytes.as_slice());
/// Ok(())
/// }
/// #
/// # look_at_bytes();
Expand Down
93 changes: 41 additions & 52 deletions src/read.rs
Expand Up @@ -858,68 +858,57 @@ fn parse_escape<'de, R: Read<'de>>(
b'r' => scratch.push(b'\r'),
b't' => scratch.push(b'\t'),
b'u' => {
// Appends `n` to `scratch` as three bytes laid out in the UTF-8
// three-byte pattern `1110xxxx 10xxxxxx 10xxxxxx`:
//   - top 4 bits of `n` go into the lead byte,
//   - middle 6 bits go into the first continuation byte,
//   - low 6 bits go into the second continuation byte.
// Existing contents of `scratch` are left untouched.
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
    let lead = 0b1110_0000 | ((n >> 12) & 0x0F) as u8;
    let middle = 0b1000_0000 | ((n >> 6) & 0x3F) as u8;
    let tail = 0b1000_0000 | (n & 0x3F) as u8;

    scratch.push(lead);
    scratch.push(middle);
    scratch.push(tail);
}

let c = match tri!(read.decode_hex_escape()) {
n @ 0xDC00..=0xDFFF => {
if validate {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
}

let utf8_bytes = [
(n >> 12 & 0x0F) as u8 | 0b1110_0000,
(n >> 6 & 0x3F) as u8 | 0b1000_0000,
(n & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

return Ok(());
return if validate {
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
} else {
encode_surrogate(scratch, n);
Ok(())
};
}

// Non-BMP characters are encoded as a sequence of
// two hex escapes, representing UTF-16 surrogates.
// If `validate` is false and we only find a single
// hex escape that is a surrogate, then we'll accept
// it instead of erroring.
// Non-BMP characters are encoded as a sequence of two hex
// escapes, representing UTF-16 surrogates. When deserializing a
// UTF-8 string, the surrogates are required to be paired,
// whereas deserializing a byte string accepts lone surrogates.
n1 @ 0xD800..=0xDBFF => {
if tri!(peek_or_eof(read)) != b'\\' {
if validate {
if tri!(peek_or_eof(read)) == b'\\' {
read.discard();
} else {
return if validate {
read.discard();
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
}

let utf8_bytes = [
(n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
(n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
(n1 & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

return Ok(());
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
encode_surrogate(scratch, n1);
Ok(())
};
}
read.discard();

if tri!(peek_or_eof(read)) != b'u' {
if validate {
if tri!(peek_or_eof(read)) == b'u' {
read.discard();
} else {
return if validate {
read.discard();
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
}

let utf8_bytes = [
(n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
(n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
(n1 & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

// The \ prior to this byte started an escape sequence,
// so we need to parse that now.
parse_escape(read, validate, scratch)?;

return Ok(());
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
encode_surrogate(scratch, n1);
// The \ prior to this byte started an escape sequence,
// so we need to parse that now. This recursive call
// does not blow the stack on malicious input because
// the escape is not \u, so it will be handled by one
// of the easy nonrecursive cases.
parse_escape(read, validate, scratch)
};
}
read.discard();

let n2 = tri!(read.decode_hex_escape());

Expand Down

0 comments on commit 77915eb

Please sign in to comment.