Skip to content

Commit

Permalink
Merge pull request #828 from lucacasonato/lone_surrogate
Browse files Browse the repository at this point in the history
Deserialize lone surrogates into byte bufs
  • Loading branch information
dtolnay committed Nov 25, 2021
2 parents 33c3134 + 07c740c commit 691466c
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 15 deletions.
13 changes: 7 additions & 6 deletions src/de.rs
Expand Up @@ -1580,20 +1580,21 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
/// ```
///
/// Backslash escape sequences like `\n` are still interpreted and required
/// to be valid, and `\u` escape sequences are required to represent valid
/// Unicode code points.
/// to be valid. `\u` escape sequences are required to represent valid
/// Unicode code points, except in the case of lone surrogates.
///
/// ```
/// use serde_bytes::ByteBuf;
///
/// fn look_at_bytes() {
/// let json_data = b"\"invalid unicode surrogate: \\uD801\"";
/// let json_data = b"\"lone surrogate: \\uD801\"";
/// let parsed: Result<ByteBuf, _> = serde_json::from_slice(json_data);
///
/// assert!(parsed.is_err());
/// assert!(parsed.is_ok());
///
/// let expected_msg = "unexpected end of hex escape at line 1 column 35";
/// assert_eq!(expected_msg, parsed.unwrap_err().to_string());
/// let expected = b"lone surrogate: \xED\xA0\x81";
/// let bytes: ByteBuf = parsed.unwrap();
/// assert_eq!(expected, &bytes[..]);
/// }
/// #
/// # look_at_bytes();
Expand Down
80 changes: 71 additions & 9 deletions src/read.rs
Expand Up @@ -225,7 +225,7 @@ where
return result(self, scratch);
}
b'\\' => {
tri!(parse_escape(self, scratch));
tri!(parse_escape(self, validate, scratch));
}
_ => {
if validate {
Expand Down Expand Up @@ -465,7 +465,7 @@ impl<'a> SliceRead<'a> {
b'\\' => {
scratch.extend_from_slice(&self.slice[start..self.index]);
self.index += 1;
tri!(parse_escape(self, scratch));
tri!(parse_escape(self, validate, scratch));
start = self.index;
}
_ => {
Expand Down Expand Up @@ -817,6 +817,16 @@ where
}
}

fn peek_or_eof<'de, R>(read: &mut R) -> Result<u8>
where
R: ?Sized + Read<'de>,
{
match tri!(read.peek()) {
Some(b) => Ok(b),
None => error(read, ErrorCode::EofWhileParsingString),
}
}

fn error<'de, R, T>(read: &R, reason: ErrorCode) -> Result<T>
where
R: ?Sized + Read<'de>,
Expand All @@ -831,7 +841,11 @@ fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> {

/// Parses a JSON escape sequence and appends it into the scratch space. Assumes
/// the previous byte read was a backslash.
fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Result<()> {
fn parse_escape<'de, R: Read<'de>>(
read: &mut R,
validate: bool,
scratch: &mut Vec<u8>,
) -> Result<()> {
let ch = tri!(next_or_eof(read));

match ch {
Expand All @@ -845,19 +859,67 @@ fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Resul
b't' => scratch.push(b'\t'),
b'u' => {
let c = match tri!(read.decode_hex_escape()) {
0xDC00..=0xDFFF => {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
n @ 0xDC00..=0xDFFF => {
if validate {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
}

let utf8_bytes = [
(n >> 12 & 0x0F) as u8 | 0b1110_0000,
(n >> 6 & 0x3F) as u8 | 0b1000_0000,
(n & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

return Ok(());
}

// Non-BMP characters are encoded as a sequence of
// two hex escapes, representing UTF-16 surrogates.
// If `validate` is false and we only find a single
// hex escape that is a surrogate, then we'll accept
// it instead of erroring.
n1 @ 0xD800..=0xDBFF => {
if tri!(next_or_eof(read)) != b'\\' {
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
if tri!(peek_or_eof(read)) != b'\\' {
if validate {
read.discard();
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
}

let utf8_bytes = [
(n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
(n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
(n1 & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

return Ok(());
}
if tri!(next_or_eof(read)) != b'u' {
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
read.discard();

if tri!(peek_or_eof(read)) != b'u' {
if validate {
read.discard();
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
}

let utf8_bytes = [
(n1 >> 12 & 0x0F) as u8 | 0b1110_0000,
(n1 >> 6 & 0x3F) as u8 | 0b1000_0000,
(n1 & 0x3F) as u8 | 0b1000_0000,
];

scratch.extend_from_slice(&utf8_bytes);

// The \ prior to this byte started an escape sequence,
// so we need to parse that now.
parse_escape(read, validate, scratch)?;

return Ok(());
}
read.discard();

let n2 = tri!(read.decode_hex_escape());

Expand Down
28 changes: 28 additions & 0 deletions tests/test.rs
Expand Up @@ -1714,6 +1714,34 @@ fn test_byte_buf_de() {
assert_eq!(v, bytes);
}

/// A lone `\u` surrogate escape deserializes into a `ByteBuf` as a
/// three-byte sequence, while a surrogate followed by a malformed or
/// unpaired escape is rejected.
#[test]
fn test_byte_buf_de_lone_surrogate() {
    // JSON inputs that must parse, paired with the exact bytes expected back.
    let accepted: Vec<(&str, Vec<u8>)> = vec![
        (r#""\ud83c""#, vec![237, 160, 188]),
        (r#""\ud83c\n""#, vec![237, 160, 188, 10]),
        (r#""\ud83c ""#, vec![237, 160, 188, 32]),
        (r#""\udc01""#, vec![237, 176, 129]),
    ];
    for (json, raw) in accepted {
        let bytes = ByteBuf::from(raw);
        let v: ByteBuf = from_str(json).unwrap();
        assert_eq!(v, bytes);
    }

    // JSON inputs that must fail to deserialize.
    let rejected = [r#""\ud83c\!""#, r#""\ud83c\u""#, r#""\ud83c\ud83c""#];
    for &json in rejected.iter() {
        let res = from_str::<ByteBuf>(json);
        assert!(res.is_err());
    }
}

#[test]
fn test_byte_buf_de_multiple() {
let s: Vec<ByteBuf> = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();
Expand Down

0 comments on commit 691466c

Please sign in to comment.