From ed5239f99842c40bd5f188936dde43c8845b4400 Mon Sep 17 00:00:00 2001 From: Dirkjan Ochtman Date: Mon, 25 Jul 2022 13:39:56 +0200 Subject: [PATCH 1/3] Simplify RFC 2822 comment parser --- src/format/parse.rs | 5 +--- src/format/scan.rs | 64 +++++++++++++++++---------------------------- 2 files changed, 25 insertions(+), 44 deletions(-) diff --git a/src/format/parse.rs b/src/format/parse.rs index f84a515dc5..c377614f6a 100644 --- a/src/format/parse.rs +++ b/src/format/parse.rs @@ -150,11 +150,8 @@ fn parse_rfc2822<'a>(parsed: &mut Parsed, mut s: &'a str) -> ParseResult<(&'a st } // optional comments - s = s.trim_left(); while let Ok((s_out, ())) = scan::comment_2822(s) { - // Trim left after every found comment, as comments are allowed to have whitespace - // between them - s = s_out.trim_left(); + s = s_out; } Ok((s, ())) diff --git a/src/format/scan.rs b/src/format/scan.rs index 674ac33ba7..3143e7901e 100644 --- a/src/format/scan.rs +++ b/src/format/scan.rs @@ -349,49 +349,33 @@ pub(super) fn timezone_name_skip(s: &str) -> ParseResult<(&str, ())> { Ok((s.trim_left_matches(|c: char| !c.is_whitespace()), ())) } -/// Tries to consume an RFC2822 comment -pub(super) fn comment_2822(mut s: &str) -> ParseResult<(&str, ())> { - macro_rules! next_char { - () => {{ - let c = s.bytes().nth(0).ok_or(TOO_SHORT)?; - s = &s[1..]; - c - }}; - } - - // Make sure the first letter is a `(` - if b'(' != next_char!() { - Err(INVALID)?; - } - - let mut depth = 1; // start with 1 as we already encountered a '(' - loop { - match next_char!() { - // If we encounter `\`, ignore the next character as it is escaped. - b'\\' => { - next_char!(); - } - - // If we encounter `(`, open a parantheses context. - b'(' => { - depth += 1; - } - - // If we encounter `)`, close a parentheses context. - // If all are closed, we found the end of the comment. - b')' => { - depth -= 1; - if depth == 0 { - break; - } - } - - // Ignore all other characters - _ => (), +/// Tries to consume an RFC2822 comment including preceding ` `. +/// +/// Returns the remaining string after the closing parenthesis. +pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> { + use CommentState::*; + + let mut state = Start; + for (i, c) in s.bytes().enumerate() { + state = match (state, c) { + (Start, b' ') => Start, + (Start, b'(') => Next(1), + (Next(1), b')') => return Ok((&s[i + 1..], ())), + (Next(depth), b'\\') => Escape(depth), + (Next(depth), b'(') => Next(depth + 1), + (Next(depth), b')') => Next(depth - 1), + (Next(depth), _) | (Escape(depth), _) => Next(depth), + _ => return Err(INVALID), }; } - Ok((s, ())) + Err(TOO_SHORT) +} + +enum CommentState { + Start, + Next(usize), + Escape(usize), } #[cfg(test)] From b415ac46b9b8bbb3f22361ca7c6ccb09cc7c9d5c Mon Sep 17 00:00:00 2001 From: Eric Sheppard Date: Mon, 25 Jul 2022 22:35:20 +1000 Subject: [PATCH 2/3] change to raw strings --- src/format/parse.rs | 4 ++-- src/format/scan.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/format/parse.rs b/src/format/parse.rs index c377614f6a..7ff5ca0c35 100644 --- a/src/format/parse.rs +++ b/src/format/parse.rs @@ -828,10 +828,10 @@ fn test_rfc2822() { ("Fri, 02 Jan 2015 17:35:20 -0800", Ok("Fri, 02 Jan 2015 17:35:20 -0800")), // leading zero ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC)", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // trailing comment ( - "Tue, 20 Jan 2015 17:35:20 -0800 ( (UTC ) (\\( (a)\\(( \\t ) ) \\\\( \\) ))", + r"Tue, 20 Jan 2015 17:35:20 -0800 ( (UTC ) (\( (a)\(( \t ) ) \\( \) ))", Ok("Tue, 20 Jan 2015 17:35:20 -0800"), ), // complex trailing comment - ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC\\)", Err(TOO_LONG)), // incorrect comment, not enough closing parentheses + (r"Tue, 20 Jan 2015 17:35:20 -0800 (UTC\)", Err(TOO_LONG)), // incorrect comment, not enough closing parentheses ("20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // no day of week ("20 JAN 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // upper case month ("Tue, 20 Jan 2015 17:35 -0800", Ok("Tue, 20 Jan 2015 17:35:00 -0800")), // no second diff --git a/src/format/scan.rs b/src/format/scan.rs index 3143e7901e..cf139c0ad1 100644 --- a/src/format/scan.rs +++ b/src/format/scan.rs @@ -392,10 +392,10 @@ fn test_rfc2822_comments() { ("((()))", Ok("")), ("(x(x(x)x)x)", Ok("")), ("( x ( x ( x ) x ) x )", Ok("")), - ("(\\)", Err(TOO_SHORT)), - ("(\\()", Ok("")), - ("(\\))", Ok("")), - ("(\\\\)", Ok("")), + (r"(\)", Err(TOO_SHORT)), + (r"(\()", Ok("")), + (r"(\))", Ok("")), + (r"(\\)", Ok("")), ("(()())", Ok("")), ("( x ( x ) x ( x ) x )", Ok("")), ]; From 7a5a28f45b1d9c254372bf16a2f2cd267cd269d5 Mon Sep 17 00:00:00 2001 From: Finomnis Date: Mon, 25 Jul 2022 16:31:17 +0200 Subject: [PATCH 3/3] Fixed incorrect whitespace parsing; added more tests --- src/format/parse.rs | 5 +++++ src/format/scan.rs | 12 ++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/format/parse.rs b/src/format/parse.rs index 7ff5ca0c35..0e2db8fae9 100644 --- a/src/format/parse.rs +++ b/src/format/parse.rs @@ -832,6 +832,11 @@ fn test_rfc2822() { Ok("Tue, 20 Jan 2015 17:35:20 -0800"), ), // complex trailing comment (r"Tue, 20 Jan 2015 17:35:20 -0800 (UTC\)", Err(TOO_LONG)), // incorrect comment, not enough closing parentheses + ( + "Tue, 20 Jan 2015 17:35:20 -0800 (UTC)\t \r\n(Anothercomment)", + Ok("Tue, 20 Jan 2015 17:35:20 -0800"), + ), // multiple comments + ("Tue, 20 Jan 2015 17:35:20 -0800 (UTC) ", Err(TOO_LONG)), // trailing whitespace after comment ("20 Jan 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // no day of week ("20 JAN 2015 17:35:20 -0800", Ok("Tue, 20 Jan 2015 17:35:20 -0800")), // upper case month ("Tue, 20 Jan 2015 17:35 -0800", Ok("Tue, 20 Jan 2015 17:35:00 -0800")), // no second diff --git a/src/format/scan.rs b/src/format/scan.rs index cf139c0ad1..9a40903436 100644 --- a/src/format/scan.rs +++ b/src/format/scan.rs @@ -355,10 +355,11 @@ pub(super) fn timezone_name_skip(s: &str) -> ParseResult<(&str, ())> { pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> { use CommentState::*; + let s = s.trim_start(); + let mut state = Start; for (i, c) in s.bytes().enumerate() { state = match (state, c) { - (Start, b' ') => Start, (Start, b'(') => Next(1), (Next(1), b')') => return Ok((&s[i + 1..], ())), (Next(depth), b'\\') => Escape(depth), @@ -383,9 +384,12 @@ enum CommentState { fn test_rfc2822_comments() { let testdata = [ ("", Err(TOO_SHORT)), + (" ", Err(TOO_SHORT)), ("x", Err(INVALID)), ("(", Err(TOO_SHORT)), ("()", Ok("")), + (" \r\n\t()", Ok("")), + ("() ", Ok(" ")), ("()z", Ok("z")), ("(x)", Ok("")), ("(())", Ok("")), @@ -402,6 +406,10 @@ fn test_rfc2822_comments() { for (test_in, expected) in testdata { let actual = comment_2822(test_in).map(|(s, _)| s); - assert_eq!(expected, actual); + assert_eq!( + expected, actual, + "{:?} expected to produce {:?}, but produced {:?}.", + test_in, expected, actual + ); } }