Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Touch up PR 828 #829

Merged
merged 8 commits into from Nov 25, 2021
18 changes: 8 additions & 10 deletions src/de.rs
Expand Up @@ -1560,7 +1560,8 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
///
/// # Examples
///
/// You can use this to parse JSON strings containing invalid UTF-8 bytes.
/// You can use this to parse JSON strings containing invalid UTF-8 bytes,
/// or unpaired surrogates.
///
/// ```
/// use serde_bytes::ByteBuf;
Expand All @@ -1580,21 +1581,18 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
/// ```
///
/// Backslash escape sequences like `\n` are still interpreted and required
/// to be valid. `\u` escape sequences are required to represent valid
/// Unicode code points, except in the case of lone surrogates.
/// to be valid. `\u` escape sequences are required to represent a valid
/// Unicode code point or lone surrogate.
///
/// ```
/// use serde_bytes::ByteBuf;
///
/// fn look_at_bytes() {
/// fn look_at_bytes() -> Result<(), serde_json::Error> {
/// let json_data = b"\"lone surrogate: \\uD801\"";
/// let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
///
/// assert!(parsed.is_ok());
///
/// let bytes: ByteBuf = serde_json::from_slice(json_data)?;
/// let expected = b"lone surrogate: \xED\xA0\x81";
/// let bytes: ByteBuf = parsed.unwrap();
/// assert_eq!(expected, &bytes[..]);
/// assert_eq!(expected, bytes.as_slice());
/// Ok(())
/// }
/// #
/// # look_at_bytes();
Expand Down
93 changes: 41 additions & 52 deletions src/read.rs
Expand Up @@ -858,68 +858,57 @@ fn parse_escape<'de, R: Read<'de>>(
b'r' => scratch.push(b'\r'),
b't' => scratch.push(b'\t'),
b'u' => {
/// Appends the three-byte, UTF-8-shaped encoding of the 16-bit value `n`
/// to the end of `scratch`.
///
/// The bytes follow the `1110xxxx 10xxxxxx 10xxxxxx` layout: the top four
/// bits of `n` go into the lead byte and the remaining twelve bits are split
/// across two continuation bytes. Existing contents of `scratch` are kept.
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
    // Payload masks: four bits for the lead byte, six per continuation byte.
    const LEAD_BITS: u16 = 0x0F;
    const CONT_BITS: u16 = 0x3F;

    let lead = 0b1110_0000 | ((n >> 12) & LEAD_BITS) as u8;
    let middle = 0b1000_0000 | ((n >> 6) & CONT_BITS) as u8;
    let last = 0b1000_0000 | (n & CONT_BITS) as u8;

    scratch.extend_from_slice(&[lead, middle, last]);
}

let c = match tri!(read.decode_hex_escape()) {
n @ 0xDC00..=0xDFFF => {
if validate {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
}

let utf8_bytes = [
(n >> 12 & 0x0F) as u8 | 0b1110_0000,
(n >> 6 & 0x3F) as u8 | 0b1000_0000,
(n & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

return Ok(());
return if validate {
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
} else {
encode_surrogate(scratch, n);
Ok(())
};
}

// Non-BMP characters are encoded as a sequence of
// two hex escapes, representing UTF-16 surrogates.
// If `validate` is false and we only find a single
// hex escape that is a surrogate, then we'll accept
// it instead of erroring.
// Non-BMP characters are encoded as a sequence of two hex
// escapes, representing UTF-16 surrogates. If deserializing a
// UTF-8 string, the surrogates are required to be paired,
// whereas deserializing a byte string accepts lone surrogates.
n1 @ 0xD800..=0xDBFF => {
if tri!(peek_or_eof(read)) != b'\\' {
if validate {
if tri!(peek_or_eof(read)) == b'\\' {
read.discard();
} else {
return if validate {
read.discard();
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
}

let utf8_bytes = [
(n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
(n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
(n1 & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

return Ok(());
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
encode_surrogate(scratch, n1);
Ok(())
};
}
read.discard();

if tri!(peek_or_eof(read)) != b'u' {
if validate {
if tri!(peek_or_eof(read)) == b'u' {
read.discard();
} else {
return if validate {
read.discard();
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
}

let utf8_bytes = [
(n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
(n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
(n1 & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

// The \ prior to this byte started an escape sequence,
// so we need to parse that now.
parse_escape(read, validate, scratch)?;

return Ok(());
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
encode_surrogate(scratch, n1);
// The \ prior to this byte started an escape sequence,
// so we need to parse that now. This recursive call
// does not blow the stack on malicious input because
// the escape is not \u, so it will be handled by one
// of the easy nonrecursive cases.
parse_escape(read, validate, scratch)
};
}
read.discard();

let n2 = tri!(read.decode_hex_escape());

Expand Down