From b77077f3bdba1e1e9369409e10b0f1e5f08b4066 Mon Sep 17 00:00:00 2001
From: Finomnis <finomnis@gmail.com>
Date: Sat, 9 Jul 2022 18:20:13 +0200
Subject: [PATCH 1/2] Add compatibility with rfc2822 comments

---
 CHANGELOG.md        |  1 +
 src/format/parse.rs | 26 +++++++++++++---
 src/format/scan.rs  | 75 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 98 insertions(+), 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ac5ebcb85..768011d77d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,7 @@ Versions with only mechanical changes will be omitted from the following list.
 * Remove libc dependency from Cargo.toml.
 * Add the `and_local_timezone` method to `NaiveDateTime`
 * Fix the behavior of `Duration::abs()` for negative durations with non-zero nanos
+* Add compatibility with rfc2822 comments (#733)
 
 ## 0.4.19
 
diff --git a/src/format/parse.rs b/src/format/parse.rs
index e57fb8a9d2..f84a515dc5 100644
--- a/src/format/parse.rs
+++ b/src/format/parse.rs
@@ -53,7 +53,10 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st
 
     // an adapted RFC 2822 syntax from Section 3.3 and 4.3:
     //
-    // date-time   = [ day-of-week "," ] date 1*S time *S
+    // c-char      = <any char except '(', ')' and '\'>
+    // c-escape    = "\" <any char>
+    // comment     = "(" *(comment / c-char / c-escape) ")" *S
+    // date-time   = [ day-of-week "," ] date 1*S time *S *comment
     // day-of-week = *S day-name *S
     // day-name    = "Mon" / "Tue" / "Wed" / "Thu" / "Fri" / "Sat" / "Sun"
     // date        = day month year
@@ -79,9 +82,10 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st
     //
     // - we do not recognize a folding white space (FWS) or comment (CFWS).
     //   for our purposes, instead, we accept any sequence of Unicode
-    //   white space characters (denoted here to `S`). any actual RFC 2822
-    //   parser is expected to parse FWS and/or CFWS themselves and replace
-    //   it with a single SP (`%x20`); this is legitimate.
+    //   white space characters (denoted here by `S`). For comments, we accept
+    //   any text within parentheses while respecting escaped parentheses.
+    //   Any actual RFC 2822 parser is expected to parse FWS and/or CFWS themselves
+    //   and replace it with a single SP (`%x20`); this is legitimate.
     //
     // - two-digit year < 50 should be interpreted by adding 2000.
     //   two-digit year >= 50 or three-digit year should be interpreted
@@ -145,6 +149,14 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st
         parsed.set_offset(i64::from(offset))?;
     }
 
+    // optional comments
+    s = s.trim_left();
+    while let Ok((s_out, ())) = scan::comment_2822(s) {
+        // Trim left after every found comment, as comments are allowed to have whitespace
+        // between them
+        s = s_out.trim_left();
+    }
+
     Ok((s, ()))
 }
 
@@ -817,6 +829,12 @@ fn test_rfc2822() {
         ("Tue, 20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // normal case
         ("Fri,  2 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // folding whitespace
         ("Fri, 02 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // leading zero
+        ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC)", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // trailing comment
+        (
+            "Tue, 20 Jan 2015 17:35:20 -0800 ( (UTC ) (\\( (a)\\(( \\t ) ) \\\\( \\) ))",
+            Ok("Tue, 20 Jan 2015 17:35:20 -0800"),
+        ), // complex trailing comment
+        ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC\\)", Err(TOO_LONG)), // incorrect comment, not enough closing parentheses
         ("20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // no day of week
         ("20 JAN 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // upper case month
         ("Tue, 20 Jan 2015 17:35 -0800", Ok("Tue, 20 Jan 2015 17:35:00 -0800")), // no second
diff --git a/src/format/scan.rs b/src/format/scan.rs
index 856fa2f567..ce574ca7db 100644
--- a/src/format/scan.rs
+++ b/src/format/scan.rs
@@ -348,3 +348,78 @@ pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, Option<i32>)>
 pub(super) fn timezone_name_skip(s: &str) -> ParseResult<(&str, ())> {
     Ok((s.trim_left_matches(|c: char| !c.is_whitespace()), ()))
 }
+
+/// Tries to consume an RFC2822 comment
+pub(super) fn comment_2822(mut s: &str) -> ParseResult<(&str, ())> {
+    macro_rules! next_char {
+        () => {{
+            s.bytes().nth(0).map(|c| {
+                s = &s[1..];
+                c
+            })
+        }};
+    }
+
+    // Make sure the first letter is a `(`
+    match next_char!() {
+        None => Err(TOO_SHORT),
+        Some(b'(') => Ok(()),
+        Some(_) => Err(INVALID),
+    }?;
+
+    let mut depth = 1; // start with 1 as we already encountered a '('
+    loop {
+        match next_char!() {
+            // If we ran out of characters, then we are still inside of a `()` but missing the `)`.
+            None => Err(TOO_SHORT),
+            // If we encounter a `\`, ignore the next character as it is escaped.
+            Some(b'\\') => next_char!().map(|_| ()).ok_or(TOO_SHORT),
+            // If we encounter `(`, open a parantheses context.
+            Some(b'(') => {
+                depth += 1;
+                Ok(())
+            }
+            // If we encounter `)`, close a parentheses context.
+            // If all are closed, we found the end of the comment.
+            Some(b')') => {
+                depth -= 1;
+                if depth == 0 {
+                    break;
+                }
+                Ok(())
+            }
+            // Ignore all other characters
+            Some(_) => Ok(()),
+        }?;
+    }
+
+    Ok((s, ()))
+}
+
+#[cfg(test)]
+#[test]
+fn test_rfc2822_comments() {
+    let testdata = [
+        ("", Err(TOO_SHORT)),
+        ("x", Err(INVALID)),
+        ("(", Err(TOO_SHORT)),
+        ("()", Ok("")),
+        ("()z", Ok("z")),
+        ("(x)", Ok("")),
+        ("(())", Ok("")),
+        ("((()))", Ok("")),
+        ("(x(x(x)x)x)", Ok("")),
+        ("( x ( x ( x ) x ) x )", Ok("")),
+        ("(\\)", Err(TOO_SHORT)),
+        ("(\\()", Ok("")),
+        ("(\\))", Ok("")),
+        ("(\\\\)", Ok("")),
+        ("(()())", Ok("")),
+        ("( x ( x ) x ( x ) x )", Ok("")),
+    ];
+
+    for (test_in, expected) in testdata {
+        let actual = comment_2822(test_in).map(|(s, _)| s);
+        assert_eq!(expected, actual);
+    }
+}

From dd10eac730c2b0a0a96aaf97443b37e3193ee73e Mon Sep 17 00:00:00 2001
From: Finomnis <finomnis@gmail.com>
Date: Sat, 9 Jul 2022 18:57:11 +0200
Subject: [PATCH 2/2] Refactor rfc2822 comment parser

---
 src/format/scan.rs | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/src/format/scan.rs b/src/format/scan.rs
index ce574ca7db..674ac33ba7 100644
--- a/src/format/scan.rs
+++ b/src/format/scan.rs
@@ -353,44 +353,42 @@ pub(super) fn timezone_name_skip(s: &str) -> ParseResult<(&str, ())> {
 pub(super) fn comment_2822(mut s: &str) -> ParseResult<(&str, ())> {
     macro_rules! next_char {
         () => {{
-            s.bytes().nth(0).map(|c| {
-                s = &s[1..];
-                c
-            })
+            let c = s.chars().next().ok_or(TOO_SHORT)?;
+            s = &s[c.len_utf8()..]; // whole char: `&s[1..]` panics inside a multi-byte char
+            c
         }};
     }
 
     // Make sure the first letter is a `(`
-    match next_char!() {
-        None => Err(TOO_SHORT),
-        Some(b'(') => Ok(()),
-        Some(_) => Err(INVALID),
-    }?;
+    if '(' != next_char!() {
+        return Err(INVALID);
+    }
 
     let mut depth = 1; // start with 1 as we already encountered a '('
     loop {
         match next_char!() {
-            // If we ran out of characters, then we are still inside of a `()` but missing the `)`.
-            None => Err(TOO_SHORT),
-            // If we encounter a `\`, ignore the next character as it is escaped.
-            Some(b'\\') => next_char!().map(|_| ()).ok_or(TOO_SHORT),
+            // If we encounter `\`, ignore the next character as it is escaped.
+            '\\' => {
+                next_char!();
+            }
+
             // If we encounter `(`, open a parantheses context.
-            Some(b'(') => {
+            '(' => {
                 depth += 1;
-                Ok(())
             }
+
             // If we encounter `)`, close a parentheses context.
             // If all are closed, we found the end of the comment.
-            Some(b')') => {
+            ')' => {
                 depth -= 1;
                 if depth == 0 {
                     break;
                 }
-                Ok(())
             }
+
             // Ignore all other characters
-            Some(_) => Ok(()),
-        }?;
+            _ => (),
+        };
     }
 
     Ok((s, ()))