From b77077f3bdba1e1e9369409e10b0f1e5f08b4066 Mon Sep 17 00:00:00 2001 From: Finomnis Date: Sat, 9 Jul 2022 18:20:13 +0200 Subject: [PATCH 1/2] Add compatibility with rfc2822 comments --- CHANGELOG.md | 1 + src/format/parse.rs | 26 +++++++++++++--- src/format/scan.rs | 75 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ac5ebcb85..768011d77d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ Versions with only mechanical changes will be omitted from the following list. * Remove libc dependency from Cargo.toml. * Add the `and_local_timezone` method to `NaiveDateTime` * Fix the behavior of `Duration::abs()` for negative durations with non-zero nanos +* Add compatibility with rfc2822 comments (#733) ## 0.4.19 diff --git a/src/format/parse.rs b/src/format/parse.rs index e57fb8a9d2..f84a515dc5 100644 --- a/src/format/parse.rs +++ b/src/format/parse.rs @@ -53,7 +53,10 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st // an adapted RFC 2822 syntax from Section 3.3 and 4.3: // - // date-time = [ day-of-week "," ] date 1*S time *S + // c-char = <any char except '(', ')' and '\\'> + // c-escape = "\" <any char> + // comment = "(" *(comment / c-char / c-escape) ")" *S + // date-time = [ day-of-week "," ] date 1*S time *S *comment // day-of-week = *S day-name *S // day-name = "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun" // date = day month year @@ -79,9 +82,10 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st // // - we do not recognize a folding white space (FWS) or comment (CFWS). // for our purposes, instead, we accept any sequence of Unicode - // white space characters (denoted here to `S`). any actual RFC 2822 - // parser is expected to parse FWS and/or CFWS themselves and replace - // it with a single SP (`%x20`); this is legitimate. + // white space characters (denoted here to `S`). 
For comments, we accept + // any text within parentheses while respecting escaped parentheses. + // Any actual RFC 2822 parser is expected to parse FWS and/or CFWS themselves + // and replace it with a single SP (`%x20`); this is legitimate. // // - two-digit year < 50 should be interpreted by adding 2000. // two-digit year >= 50 or three-digit year should be interpreted @@ -145,6 +149,14 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st parsed.set_offset(i64::from(offset))?; } + // optional comments + s = s.trim_left(); + while let Ok((s_out, ())) = scan::comment_2822(s) { + // Trim left after every found comment, as comments are allowed to have whitespace + // between them + s = s_out.trim_left(); + } + Ok((s, ())) } @@ -817,6 +829,12 @@ fn test_rfc2822() { ("Tue, 20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // normal case ("Fri, 2 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // folding whitespace ("Fri, 02 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // leading zero + ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC)", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // trailing comment + ( + "Tue, 20 Jan 2015 17:35:20 -0800 ( (UTC ) (\\( (a)\\(( \\t ) ) \\\\( \\) ))", + Ok("Tue, 20 Jan 2015 17:35:20 -0800"), + ), // complex trailing comment + ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC\\)", Err(TOO_LONG)), // incorrect comment, not enough closing parentheses ("20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // no day of week ("20 JAN 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // upper case month ("Tue, 20 Jan 2015 17:35 -0800", Ok("Tue, 20 Jan 2015 17:35:00 -0800")), // no second diff --git a/src/format/scan.rs b/src/format/scan.rs index 856fa2f567..ce574ca7db 100644 --- a/src/format/scan.rs +++ b/src/format/scan.rs @@ -348,3 +348,78 @@ pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, Option<i32>)> pub(super) fn timezone_name_skip(s: 
&str) -> ParseResult<(&str, ())> { Ok((s.trim_left_matches(|c: char| !c.is_whitespace()), ())) } + +/// Tries to consume an RFC2822 comment +pub(super) fn comment_2822(mut s: &str) -> ParseResult<(&str, ())> { + macro_rules! next_char { + () => {{ + s.bytes().nth(0).map(|c| { + s = &s[1..]; + c + }) + }}; + } + + // Make sure the first letter is a `(` + match next_char!() { + None => Err(TOO_SHORT), + Some(b'(') => Ok(()), + Some(_) => Err(INVALID), + }?; + + let mut depth = 1; // start with 1 as we already encountered a '(' + loop { + match next_char!() { + // If we ran out of characters, then we are still inside of a `()` but missing the `)`. + None => Err(TOO_SHORT), + // If we encounter a `\`, ignore the next character as it is escaped. + Some(b'\\') => next_char!().map(|_| ()).ok_or(TOO_SHORT), + // If we encounter `(`, open a parantheses context. + Some(b'(') => { + depth += 1; + Ok(()) + } + // If we encounter `)`, close a parentheses context. + // If all are closed, we found the end of the comment. 
+ Some(b')') => { + depth -= 1; + if depth == 0 { + break; + } + Ok(()) + } + // Ignore all other characters + Some(_) => Ok(()), + }?; + } + + Ok((s, ())) +} + +#[cfg(test)] +#[test] +fn test_rfc2822_comments() { + let testdata = [ + ("", Err(TOO_SHORT)), + ("x", Err(INVALID)), + ("(", Err(TOO_SHORT)), + ("()", Ok("")), + ("()z", Ok("z")), + ("(x)", Ok("")), + ("(())", Ok("")), + ("((()))", Ok("")), + ("(x(x(x)x)x)", Ok("")), + ("( x ( x ( x ) x ) x )", Ok("")), + ("(\\)", Err(TOO_SHORT)), + ("(\\()", Ok("")), + ("(\\))", Ok("")), + ("(\\\\)", Ok("")), + ("(()())", Ok("")), + ("( x ( x ) x ( x ) x )", Ok("")), + ]; + + for (test_in, expected) in testdata { + let actual = comment_2822(test_in).map(|(s, _)| s); + assert_eq!(expected, actual); + } +} From dd10eac730c2b0a0a96aaf97443b37e3193ee73e Mon Sep 17 00:00:00 2001 From: Finomnis Date: Sat, 9 Jul 2022 18:57:11 +0200 Subject: [PATCH 2/2] Refactor rfc2822 comment parser --- src/format/scan.rs | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/format/scan.rs b/src/format/scan.rs index ce574ca7db..674ac33ba7 100644 --- a/src/format/scan.rs +++ b/src/format/scan.rs @@ -353,44 +353,42 @@ pub(super) fn timezone_name_skip(s: &str) -> ParseResult<(&str, ())> { pub(super) fn comment_2822(mut s: &str) -> ParseResult<(&str, ())> { macro_rules! next_char { () => {{ - s.bytes().nth(0).map(|c| { - s = &s[1..]; - c - }) + let c = s.bytes().nth(0).ok_or(TOO_SHORT)?; + s = &s[1..]; + c }}; } // Make sure the first letter is a `(` - match next_char!() { - None => Err(TOO_SHORT), - Some(b'(') => Ok(()), - Some(_) => Err(INVALID), - }?; + if b'(' != next_char!() { + Err(INVALID)?; + } let mut depth = 1; // start with 1 as we already encountered a '(' loop { match next_char!() { - // If we ran out of characters, then we are still inside of a `()` but missing the `)`. - None => Err(TOO_SHORT), - // If we encounter a `\`, ignore the next character as it is escaped. 
- Some(b'\\') => next_char!().map(|_| ()).ok_or(TOO_SHORT), + // If we encounter `\`, ignore the next character as it is escaped. + b'\\' => { + next_char!(); + } + // If we encounter `(`, open a parantheses context. - Some(b'(') => { + b'(' => { depth += 1; - Ok(()) } + // If we encounter `)`, close a parentheses context. // If all are closed, we found the end of the comment. - Some(b')') => { + b')' => { depth -= 1; if depth == 0 { break; } - Ok(()) } + // Ignore all other characters - Some(_) => Ok(()), - }?; + _ => (), + }; } Ok((s, ()))