Skip to content

Commit

Permalink
Allow lone surrogates in raw values
Browse files Browse the repository at this point in the history
  • Loading branch information
lucacasonato committed Nov 25, 2021
1 parent 76e376c commit 5ec8141
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 29 deletions.
35 changes: 6 additions & 29 deletions src/read.rs
Expand Up @@ -955,36 +955,13 @@ where
match ch {
b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {}
b'u' => {
let n = match tri!(read.decode_hex_escape()) {
0xDC00..=0xDFFF => {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
}

// Non-BMP characters are encoded as a sequence of
// two hex escapes, representing UTF-16 surrogates.
n1 @ 0xD800..=0xDBFF => {
if tri!(next_or_eof(read)) != b'\\' {
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
}
if tri!(next_or_eof(read)) != b'u' {
return error(read, ErrorCode::UnexpectedEndOfHexEscape);
}

let n2 = tri!(read.decode_hex_escape());

if n2 < 0xDC00 || n2 > 0xDFFF {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
}

(((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000
}

n => n as u32,
};
// At this point we don't care whether the code point is valid; we just
// want to consume it. Validity cannot be decided here, because it
// depends on whether this string will ultimately be parsed into a
// string or a byte buffer in the "real" parse.

if char::from_u32(n).is_none() {
return error(read, ErrorCode::InvalidUnicodeCodePoint);
}
tri!(read.decode_hex_escape());
}
_ => {
return error(read, ErrorCode::InvalidEscape);
Expand Down
14 changes: 14 additions & 0 deletions tests/test.rs
Expand Up @@ -1742,6 +1742,20 @@ fn test_byte_buf_de_lone_surrogate() {
assert!(res.is_err());
}

/// Checks how `RawValue` deserialization treats `\u` escapes that encode
/// unpaired UTF-16 surrogates: such escapes must be accepted, while an
/// unknown escape character or a truncated `\u` escape must still fail.
#[cfg(feature = "raw_value")]
#[test]
fn test_raw_de_lone_surrogate() {
    use serde_json::value::RawValue;

    // Single entry point so every case goes through the same parse call.
    let parses = |json: &str| from_str::<Box<RawValue>>(json).is_ok();

    // Lone or mismatched surrogate escapes that are expected to parse.
    let accepted: &[&str] = &[
        r#""\ud83c""#,
        r#""\ud83c\n""#,
        r#""\ud83c ""#,
        r#""\udc01 ""#,
        r#""\ud83c\ud83c""#,
    ];
    for &json in accepted {
        assert!(parses(json));
    }

    // A surrogate followed by a malformed escape is expected to be rejected.
    let rejected: &[&str] = &[
        r#""\udc01\!""#,
        r#""\udc01\u""#,
    ];
    for &json in rejected {
        assert!(!parses(json));
    }
}

#[test]
fn test_byte_buf_de_multiple() {
let s: Vec<ByteBuf> = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();
Expand Down

0 comments on commit 5ec8141

Please sign in to comment.