Add compatibility with rfc2822 comments (#733)

* Add compatibility with rfc2822 comments * Refactor rfc2822 comment parser
chronotope · Jul 24, 2022 · acd4ecf · acd4ecf
1 parent 187819f
commit acd4ecf
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -34,6 +34,7 @@ Versions with only mechanical changes will be omitted from the following list.
 * Remove libc dependency from Cargo.toml.
 * Add the `and_local_timezone` method to `NaiveDateTime`
 * Fix the behavior of `Duration::abs()` for negative durations with non-zero nanos
+* Add compatibility with rfc2822 comments (#733)
 
 ## 0.4.19
 

diff --git a/src/format/parse.rs b/src/format/parse.rs
@@ -53,7 +53,10 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st
 
     // an adapted RFC 2822 syntax from Section 3.3 and 4.3:
     //
-    // date-time   = [ day-of-week "," ] date 1*S time *S
+    // c-char      = <any char except '(', ')' and '\\'>
+    // c-escape    = "\" <any char>
+    // comment     = "(" *(comment / c-char / c-escape) ")" *S
+    // date-time   = [ day-of-week "," ] date 1*S time *S *comment
     // day-of-week = *S day-name *S
     // day-name    = "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
     // date        = day month year
@@ -79,9 +82,10 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st
     //
     // - we do not recognize a folding white space (FWS) or comment (CFWS).
     //   for our purposes, instead, we accept any sequence of Unicode
-    //   white space characters (denoted here to `S`). any actual RFC 2822
-    //   parser is expected to parse FWS and/or CFWS themselves and replace
-    //   it with a single SP (`%x20`); this is legitimate.
+    //   white space characters (denoted here by `S`). For comments, we accept
+    //   any text within parentheses while respecting escaped parentheses.
+    //   Any actual RFC 2822 parser is expected to parse FWS and/or CFWS itself
+    //   and replace each with a single SP (`%x20`); this is legitimate.
     //
     // - two-digit year < 50 should be interpreted by adding 2000.
     //   two-digit year >= 50 or three-digit year should be interpreted
@@ -145,6 +149,14 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st
         parsed.set_offset(i64::from(offset))?;
     }
 
+    // optional comments
+    s = s.trim_left();
+    while let Ok((s_out, ())) = scan::comment_2822(s) {
+        // Trim left after every found comment, as comments are allowed to have whitespace
+        // between them
+        s = s_out.trim_left();
+    }
+
     Ok((s, ()))
 }
 
@@ -817,6 +829,12 @@ fn test_rfc2822() {
         ("Tue, 20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // normal case
         ("Fri,  2 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // folding whitespace
         ("Fri, 02 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // leading zero
+        ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC)", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // trailing comment
+        (
+            "Tue, 20 Jan 2015 17:35:20 -0800 ( (UTC ) (\\( (a)\\(( \\t ) ) \\\\( \\) ))",
+            Ok("Tue, 20 Jan 2015 17:35:20 -0800"),
+        ), // complex trailing comment
+        ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC\\)", Err(TOO_LONG)), // incorrect comment, not enough closing parentheses
         ("20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // no day of week
         ("20 JAN 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // upper case month
         ("Tue, 20 Jan 2015 17:35 -0800", Ok("Tue, 20 Jan 2015 17:35:00 -0800")), // no second

diff --git a/src/format/scan.rs b/src/format/scan.rs
@@ -348,3 +348,76 @@ pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, Option<i32>)>
 pub(super) fn timezone_name_skip(s: &str) -> ParseResult<(&str, ())> {
     Ok((s.trim_left_matches(|c: char| !c.is_whitespace()), ()))
 }
+
+/// Tries to consume a single RFC 2822 comment from the start of `s`.
+///
+/// A comment starts with `(` and ends at the matching `)`. Comments may be
+/// nested, and a backslash escapes the character that follows it, so an
+/// escaped parenthesis neither opens nor closes a nesting level.
+///
+/// On success, returns the remainder of `s` after the closing `)`.
+///
+/// # Errors
+///
+/// - `TOO_SHORT` if `s` is empty or ends before the comment is closed
+///   (including when it ends right after a backslash).
+/// - `INVALID` if `s` does not start with `(`.
+pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> {
+    // Scan bytes rather than chars: the only bytes of interest are ASCII, and
+    // every byte of a multi-byte UTF-8 sequence is >= 0x80, so it can never be
+    // mistaken for `(`, `)` or `\`. The string is only sliced once, right
+    // after an ASCII `)`, which is always a char boundary; slicing one byte
+    // at a time would panic on non-ASCII input.
+    let mut bytes = s.bytes().enumerate();
+
+    // Make sure the first character is a `(`.
+    match bytes.next() {
+        Some((_, b'(')) => {}
+        Some(_) => return Err(INVALID),
+        None => return Err(TOO_SHORT),
+    }
+
+    let mut depth = 1usize; // start with 1 as we already encountered a '('
+    let mut escaped = false;
+    for (i, byte) in bytes {
+        if escaped {
+            // The byte after `\` is skipped. If it begins a multi-byte
+            // character, its continuation bytes fall through to the
+            // catch-all arm below and are ignored as well.
+            escaped = false;
+            continue;
+        }
+
+        match byte {
+            // If we encounter `\`, ignore the next character as it is escaped.
+            b'\\' => escaped = true,
+
+            // If we encounter `(`, open a parentheses context.
+            b'(' => depth += 1,
+
+            // If we encounter `)`, close a parentheses context.
+            // If all are closed, we found the end of the comment.
+            b')' => {
+                depth -= 1;
+                if depth == 0 {
+                    return Ok((&s[i + 1..], ()));
+                }
+            }
+
+            // Ignore all other characters.
+            _ => {}
+        }
+    }
+
+    // Ran out of input before the outermost `(` was closed.
+    Err(TOO_SHORT)
+}
+
+#[cfg(test)]
+#[test]
+fn test_rfc2822_comments() {
+    // Each case pairs an input with either the text left over after the
+    // comment has been consumed, or the error the scanner must report.
+    let cases = [
+        // not a (complete) comment
+        ("", Err(TOO_SHORT)),
+        ("x", Err(INVALID)),
+        ("(", Err(TOO_SHORT)),
+        // flat comments, with and without trailing text
+        ("()", Ok("")),
+        ("()z", Ok("z")),
+        ("(x)", Ok("")),
+        // nested comments
+        ("(())", Ok("")),
+        ("((()))", Ok("")),
+        ("(x(x(x)x)x)", Ok("")),
+        ("( x ( x ( x ) x ) x )", Ok("")),
+        // backslash escapes
+        ("(\\)", Err(TOO_SHORT)),
+        ("(\\()", Ok("")),
+        ("(\\))", Ok("")),
+        ("(\\\\)", Ok("")),
+        // sibling comments inside one outer comment
+        ("(()())", Ok("")),
+        ("( x ( x ) x ( x ) x )", Ok("")),
+    ];
+
+    for (input, expected) in cases {
+        let actual = match comment_2822(input) {
+            Ok((rest, ())) => Ok(rest),
+            Err(err) => Err(err),
+        };
+        assert_eq!(expected, actual);
+    }
+}