diff --git a/src/de.rs b/src/de.rs index a2f34b908..d9a5fee8c 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1580,20 +1580,21 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { /// ``` /// /// Backslash escape sequences like `\n` are still interpreted and required - /// to be valid, and `\u` escape sequences are required to represent valid - /// Unicode code points. + /// to be valid. `\u` escape sequences are required to represent valid + /// Unicode code points, except in the case of lone surrogates. /// /// ``` /// use serde_bytes::ByteBuf; /// /// fn look_at_bytes() { - /// let json_data = b"\"invalid unicode surrogate: \\uD801\""; + /// let json_data = b"\"lone surrogate: \\uD801\""; /// let parsed: Result = serde_json::from_slice(json_data); /// - /// assert!(parsed.is_err()); + /// assert!(parsed.is_ok()); /// - /// let expected_msg = "unexpected end of hex escape at line 1 column 35"; - /// assert_eq!(expected_msg, parsed.unwrap_err().to_string()); + /// let expected = b"lone surrogate: \xED\xA0\x81"; + /// let bytes: ByteBuf = parsed.unwrap(); + /// assert_eq!(expected, &bytes[..]); /// } /// # /// # look_at_bytes(); diff --git a/src/read.rs b/src/read.rs index 4e883c68b..034cc6557 100644 --- a/src/read.rs +++ b/src/read.rs @@ -225,7 +225,7 @@ where return result(self, scratch); } b'\\' => { - tri!(parse_escape(self, scratch)); + tri!(parse_escape(self, validate, scratch)); } _ => { if validate { @@ -465,7 +465,7 @@ impl<'a> SliceRead<'a> { b'\\' => { scratch.extend_from_slice(&self.slice[start..self.index]); self.index += 1; - tri!(parse_escape(self, scratch)); + tri!(parse_escape(self, validate, scratch)); start = self.index; } _ => { @@ -817,6 +817,16 @@ where } } +fn peek_or_eof<'de, R>(read: &mut R) -> Result +where + R: ?Sized + Read<'de>, +{ + match tri!(read.peek()) { + Some(b) => Ok(b), + None => error(read, ErrorCode::EofWhileParsingString), + } +} + fn error<'de, R, T>(read: &R, reason: ErrorCode) -> Result where R: ?Sized + Read<'de>, @@ -831,7 +841,11 @@ fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> { /// Parses a JSON escape sequence and appends it into the scratch space. Assumes /// the previous byte read was a backslash. -fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec) -> Result<()> { +fn parse_escape<'de, R: Read<'de>>( + read: &mut R, + validate: bool, + scratch: &mut Vec, +) -> Result<()> { let ch = tri!(next_or_eof(read)); match ch { @@ -845,19 +859,67 @@ fn parse_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec) -> Resul b't' => scratch.push(b'\t'), b'u' => { let c = match tri!(read.decode_hex_escape()) { - 0xDC00..=0xDFFF => { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + n @ 0xDC00..=0xDFFF => { + if validate { + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } + + let utf8_bytes = [ + (n >> 12 & 0x0F) as u8 | 0b1110_0000, + (n >> 6 & 0x3F) as u8 | 0b1000_0000, + (n & 0x3F) as u8 | 0b1000_0000, + ]; + + scratch.extend_from_slice(&utf8_bytes); + + return Ok(()); } // Non-BMP characters are encoded as a sequence of // two hex escapes, representing UTF-16 surrogates. + // If `validate` is false and we only find a single + // hex escape that is a surrogate, then we'll accept + // it instead of erroring. n1 @ 0xD800..=0xDBFF => { - if tri!(next_or_eof(read)) != b'\\' { - return error(read, ErrorCode::UnexpectedEndOfHexEscape); + if tri!(peek_or_eof(read)) != b'\\' { + if validate { + read.discard(); + return error(read, ErrorCode::UnexpectedEndOfHexEscape); + } + + let utf8_bytes = [ + (n1 >> 12 & 0x0F) as u8 | 0b1110_0000, + (n1 >> 6 & 0x3F) as u8 | 0b1000_0000, + (n1 & 0x3F) as u8 | 0b1000_0000, + ]; + + scratch.extend_from_slice(&utf8_bytes); + + return Ok(()); } - if tri!(next_or_eof(read)) != b'u' { - return error(read, ErrorCode::UnexpectedEndOfHexEscape); + read.discard(); + + if tri!(peek_or_eof(read)) != b'u' { + if validate { + read.discard(); + return error(read, ErrorCode::UnexpectedEndOfHexEscape); + } + + let utf8_bytes = [ + (n1 >> 12 & 0x0F) as u8 | 0b1110_0000, + (n1 >> 6 & 0x3F) as u8 | 0b1000_0000, + (n1 & 0x3F) as u8 | 0b1000_0000, + ]; + + scratch.extend_from_slice(&utf8_bytes); + + // The \ prior to this byte started an escape sequence, + // so we need to parse that now. + parse_escape(read, validate, scratch)?; + + return Ok(()); } + read.discard(); let n2 = tri!(read.decode_hex_escape()); diff --git a/tests/test.rs b/tests/test.rs index 4b7540540..636053004 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1714,6 +1714,34 @@ fn test_byte_buf_de() { assert_eq!(v, bytes); } +#[test] +fn test_byte_buf_de_lone_surrogate() { + let bytes = ByteBuf::from(vec![237, 160, 188]); + let v: ByteBuf = from_str(r#""\ud83c""#).unwrap(); + assert_eq!(v, bytes); + + let bytes = ByteBuf::from(vec![237, 160, 188, 10]); + let v: ByteBuf = from_str(r#""\ud83c\n""#).unwrap(); + assert_eq!(v, bytes); + + let bytes = ByteBuf::from(vec![237, 160, 188, 32]); + let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap(); + assert_eq!(v, bytes); + + let bytes = ByteBuf::from(vec![237, 176, 129]); + let v: ByteBuf = from_str(r#""\udc01""#).unwrap(); + assert_eq!(v, bytes); + + let res = from_str::(r#""\ud83c\!""#); + assert!(res.is_err()); + + let res = from_str::(r#""\ud83c\u""#); + assert!(res.is_err()); + + let res = from_str::(r#""\ud83c\ud83c""#); + assert!(res.is_err()); +} + #[test] fn test_byte_buf_de_multiple() { let s: Vec = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();