diff --git a/src/read.rs b/src/read.rs index 2621f3f3f..fff690573 100644 --- a/src/read.rs +++ b/src/read.rs @@ -955,36 +955,13 @@ where match ch { b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {} b'u' => { - let n = match tri!(read.decode_hex_escape()) { - 0xDC00..=0xDFFF => { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } - - // Non-BMP characters are encoded as a sequence of - // two hex escapes, representing UTF-16 surrogates. - n1 @ 0xD800..=0xDBFF => { - if tri!(next_or_eof(read)) != b'\\' { - return error(read, ErrorCode::UnexpectedEndOfHexEscape); - } - if tri!(next_or_eof(read)) != b'u' { - return error(read, ErrorCode::UnexpectedEndOfHexEscape); - } - - let n2 = tri!(read.decode_hex_escape()); - - if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } - - (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000 - } - - n => n as u32, - }; + // At this point we don't care if the codepoint is valid. We just + // want to consume it. We don't actually know what is valid or not + // at this point, because that depends on if this string will + // ultimately be parsed into a string or a byte buffer in the "real" + // parse. - if char::from_u32(n).is_none() { - return error(read, ErrorCode::InvalidUnicodeCodePoint); - } + tri!(read.decode_hex_escape()); } _ => { return error(read, ErrorCode::InvalidEscape); diff --git a/tests/test.rs b/tests/test.rs index 636053004..fa96e812d 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1742,6 +1742,20 @@ fn test_byte_buf_de_lone_surrogate() { assert!(res.is_err()); } +#[cfg(feature = "raw_value")] +#[test] +fn test_raw_de_lone_surrogate() { + use serde_json::value::RawValue; + + assert!(from_str::>(r#""\ud83c""#).is_ok()); + assert!(from_str::>(r#""\ud83c\n""#).is_ok()); + assert!(from_str::>(r#""\ud83c ""#).is_ok()); + assert!(from_str::>(r#""\udc01 ""#).is_ok()); + assert!(from_str::>(r#""\udc01\!""#).is_err()); + assert!(from_str::>(r#""\udc01\u""#).is_err()); + assert!(from_str::>(r#""\ud83c\ud83c""#).is_ok()); +} + #[test] fn test_byte_buf_de_multiple() { let s: Vec = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();