Skip to content

Commit

Permalink
Add compatibility with rfc2822 comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Finomnis committed Jul 9, 2022
1 parent 051e117 commit 096d53d
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -33,6 +33,7 @@ Versions with only mechanical changes will be omitted from the following list.
* Add support for getting week bounds based on a specific `NaiveDate` and a `Weekday` (#666)
* Remove libc dependency from Cargo.toml.
* Add the `and_local_timezone` method to `NaiveDateTime`
* Add compatibility with rfc2822 comments (#733)

## 0.4.19

Expand Down
26 changes: 22 additions & 4 deletions src/format/parse.rs
Expand Up @@ -53,7 +53,10 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st

// an adapted RFC 2822 syntax from Section 3.3 and 4.3:
//
// date-time = [ day-of-week "," ] date 1*S time *S
// c-char = <any char except '(', ')' and '\\'>
// c-escape = "\" <any char>
// comment = *S "(" *(comment / c-char / c-escape) ")" *S
// date-time = [ day-of-week "," ] date 1*S time [ *comment ]
// day-of-week = *S day-name *S
// day-name = "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
// date = day month year
Expand All @@ -79,9 +82,10 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st
//
// - we do not recognize a folding white space (FWS) or comment (CFWS).
// for our purposes, instead, we accept any sequence of Unicode
// white space characters (denoted here to `S`). any actual RFC 2822
// parser is expected to parse FWS and/or CFWS themselves and replace
// it with a single SP (`%x20`); this is legitimate.
// white space characters (denoted here as `S`). For comments, we accept
// any text within parentheses while respecting escaped parentheses.
// Any actual RFC 2822 parser is expected to parse FWS and/or CFWS itself
// and replace each occurrence with a single SP (`%x20`); this is legitimate.
//
// - two-digit year < 50 should be interpreted by adding 2000.
// two-digit year >= 50 or three-digit year should be interpreted
Expand Down Expand Up @@ -145,6 +149,14 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st
parsed.set_offset(i64::from(offset))?;
}

// optional comments
s = s.trim_left();
while let Ok((s_out, ())) = scan::comment_2822(s) {
// Trim left after every found comment, as comments are allowed to have whitespace
// between them
s = s_out.trim_left();
}

Ok((s, ()))
}

Expand Down Expand Up @@ -817,6 +829,12 @@ fn test_rfc2822() {
("Tue, 20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // normal case
("Fri, 2 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // folding whitespace
("Fri, 02 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // leading zero
("Tue, 20 Jan 2015 17:35:20 -0800 (UTC)", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // trailing comment
(
"Tue, 20 Jan 2015 17:35:20 -0800 ( (UTC ) (\\( (a)\\(( \\t ) ) \\\\( \\) ))",
Ok("Tue, 20 Jan 2015 17:35:20 -0800"),
), // complex trailing comment
("Tue, 20 Jan 2015 17:35:20 -0800 (UTC\\)", Err(TOO_LONG)), // incorrect comment, not enough closing parentheses
("20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // no day of week
("20 JAN 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // upper case month
("Tue, 20 Jan 2015 17:35 -0800", Ok("Tue, 20 Jan 2015 17:35:00 -0800")), // no second
Expand Down
75 changes: 75 additions & 0 deletions src/format/scan.rs
Expand Up @@ -348,3 +348,78 @@ pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, Option<i32>)>
/// Skips over a timezone name without interpreting it.
///
/// Drops every leading character up to (but not including) the first
/// whitespace character and returns what is left. This never fails: an input
/// that is empty or starts with whitespace is returned unchanged.
pub(super) fn timezone_name_skip(s: &str) -> ParseResult<(&str, ())> {
    // Anything that is not whitespace is treated as part of the name.
    let is_name_char = |c: char| !c.is_whitespace();
    let rest = s.trim_left_matches(is_name_char);
    Ok((rest, ()))
}

/// Tries to consume an RFC2822 comment
/// Tries to consume a single RFC 2822 comment from the start of `s`.
///
/// A comment starts with `(` and runs up to the matching `)`. Comments may be
/// nested, and a backslash escapes the character that follows it, so an
/// escaped parenthesis does not open or close a nesting level.
///
/// On success, returns the input remaining after the closing `)`.
///
/// # Errors
///
/// * `TOO_SHORT` if `s` is empty, if the comment is not closed before the end
///   of the input, or if the input ends right after a backslash.
/// * `INVALID` if `s` does not start with `(`.
pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> {
    // Scan bytes together with their offsets rather than re-slicing the string
    // after every byte: slicing one byte at a time panics as soon as the
    // comment contains a multi-byte UTF-8 character, because the cut would
    // land inside that character.
    let mut bytes = s.bytes().enumerate();

    // Make sure the first byte is a `(`.
    match bytes.next() {
        None => return Err(TOO_SHORT),
        Some((_, b'(')) => {}
        Some(_) => return Err(INVALID),
    }

    let mut depth = 1usize; // start with 1 as we already encountered a '('
    while let Some((index, byte)) = bytes.next() {
        match byte {
            // A `\` escapes whatever follows. Skipping a single byte is enough
            // even for a multi-byte character: its remaining (continuation)
            // bytes can never be mistaken for `(`, `)` or `\`.
            b'\\' => {
                if bytes.next().is_none() {
                    return Err(TOO_SHORT);
                }
            }
            // `(` opens a nested parentheses context.
            b'(' => depth += 1,
            // `)` closes a parentheses context; once all of them are closed
            // we have found the end of the comment.
            b')' => {
                depth -= 1;
                if depth == 0 {
                    // `)` is ASCII, so `index + 1` is always a char boundary.
                    return Ok((&s[index + 1..], ()));
                }
            }
            // Ignore all other bytes.
            _ => {}
        }
    }

    // We ran out of input while still inside of a `()`: the `)` is missing.
    Err(TOO_SHORT)
}

#[cfg(test)]
#[test]
fn test_rfc2822_comments() {
    // Each case pairs an input with the expected outcome of `comment_2822`:
    // either the text left over after the comment, or the parse error.
    let cases = [
        // Inputs that do not start a comment, or that never finish one.
        ("", Err(TOO_SHORT)),
        ("x", Err(INVALID)),
        ("(", Err(TOO_SHORT)),
        // Minimal comments, with and without trailing text.
        ("()", Ok("")),
        ("()z", Ok("z")),
        ("(x)", Ok("")),
        // Nested comments.
        ("(())", Ok("")),
        ("((()))", Ok("")),
        ("(x(x(x)x)x)", Ok("")),
        ("( x ( x ( x ) x ) x )", Ok("")),
        // Backslash escapes.
        ("(\\)", Err(TOO_SHORT)),
        ("(\\()", Ok("")),
        ("(\\))", Ok("")),
        ("(\\\\)", Ok("")),
        // Sibling comments inside one outer comment.
        ("(()())", Ok("")),
        ("( x ( x ) x ( x ) x )", Ok("")),
    ];

    for (input, want) in cases {
        // Only the remaining text matters; drop the unit payload.
        let got = match comment_2822(input) {
            Ok((rest, ())) => Ok(rest),
            Err(err) => Err(err),
        };
        assert_eq!(want, got);
    }
}

0 comments on commit 096d53d

Please sign in to comment.