From 51e9616deeb0ab1f42a481201aaf28043de76207 Mon Sep 17 00:00:00 2001
From: Luca Casonato
Date: Thu, 25 Nov 2021 14:41:09 +0100
Subject: [PATCH] Allow lone surrogates in raw values

---
 src/read.rs   | 35 ++++++++---------------------------
 tests/test.rs | 14 ++++++++++++++
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/src/read.rs b/src/read.rs
index 7bad5708a..f6efbb791 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -951,34 +951,15 @@ where
 
     match ch {
         b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {}
-        b'u' => match tri!(read.decode_hex_escape()) {
-            0xDC00..=0xDFFF => {
-                return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-            }
-
-            // Non-BMP characters are encoded as a sequence of
-            // two hex escapes, representing UTF-16 surrogates.
-            n1 @ 0xD800..=0xDBFF => {
-                if tri!(next_or_eof(read)) != b'\\' {
-                    return error(read, ErrorCode::UnexpectedEndOfHexEscape);
-                }
-                if tri!(next_or_eof(read)) != b'u' {
-                    return error(read, ErrorCode::UnexpectedEndOfHexEscape);
-                }
-
-                let n2 = tri!(read.decode_hex_escape());
-                if n2 < 0xDC00 || n2 > 0xDFFF {
-                    return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-                }
-
-                let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
-                if char::from_u32(n).is_none() {
-                    return error(read, ErrorCode::InvalidUnicodeCodePoint);
-                }
-            }
+        b'u' => {
+            // At this point we don't care if the codepoint is valid. We just
+            // want to consume it. We don't actually know what is valid or not
+            // at this point, because that depends on if this string will
+            // ultimately be parsed into a string or a byte buffer in the "real"
+            // parse.
 
-            _ => {}
-        },
+            tri!(read.decode_hex_escape());
+        }
         _ => {
             return error(read, ErrorCode::InvalidEscape);
         }
diff --git a/tests/test.rs b/tests/test.rs
index 636053004..fa96e812d 100644
--- a/tests/test.rs
+++ b/tests/test.rs
@@ -1742,6 +1742,20 @@ fn test_byte_buf_de_lone_surrogate() {
     assert!(res.is_err());
 }
 
+#[cfg(feature = "raw_value")]
+#[test]
+fn test_raw_de_lone_surrogate() {
+    use serde_json::value::RawValue;
+
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\n""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c ""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\udc01 ""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
+    assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
+}
+
 #[test]
 fn test_byte_buf_de_multiple() {
     let s: Vec<ByteBuf> = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();