From 3e702b9804a07f610948f731bbb8cfdcec8065a5 Mon Sep 17 00:00:00 2001 From: Alexander Akait <4567934+alexander-akait@users.noreply.github.com> Date: Fri, 18 Nov 2022 10:16:22 +0300 Subject: [PATCH] fix(es/parser): Fix parsing of regexp (#6469) **Related issue:** - Closes https://github.com/swc-project/swc/issues/6322. - Closes https://github.com/swc-project/swc/issues/6323. --- crates/swc_ecma_parser/src/lexer/mod.rs | 31 ++++---- crates/swc_ecma_parser/src/lexer/state.rs | 11 +++ crates/swc_ecma_parser/src/lexer/tests.rs | 62 +++++++++++++--- crates/swc_ecma_parser/src/parser/expr.rs | 70 ++++++++++++------- crates/swc_ecma_parser/src/parser/input.rs | 13 ++++ crates/swc_ecma_parser/src/parser/stmt.rs | 14 ++++ .../fail/095bea002b10b8e1.js.stderr | 4 +- .../fail/97fc32bf01227e39.js.stderr | 4 +- 8 files changed, 160 insertions(+), 49 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 26b1a3f46b52..4d54641ec7bf 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -695,12 +695,6 @@ impl<'a, I: Input> Lexer<'a, I> { #[inline(never)] fn read_slash(&mut self) -> LexResult> { debug_assert_eq!(self.cur(), Some('/')); - // let start = self.cur_pos(); - - // Regex - if self.state.is_expr_allowed { - return self.read_regexp().map(Some); - } // Divide operator self.bump(); @@ -1120,19 +1114,25 @@ impl<'a, I: Input> Lexer<'a, I> { } /// Expects current char to be '/' - fn read_regexp(&mut self) -> LexResult { + fn read_regexp(&mut self, start: BytePos) -> LexResult { + self.input.reset_to(start); + debug_assert_eq!(self.cur(), Some('/')); + let start = self.cur_pos(); + self.bump(); let (mut escaped, mut in_class) = (false, false); - // let content_start = self.cur_pos(); + let content = self.with_buf(|l, buf| { while let Some(c) = l.cur() { // This is ported from babel. // Seems like regexp literal cannot contain linebreak. if c.is_line_terminator() { - l.error(start, SyntaxError::UnterminatedRegExp)?; + let span = l.span(start); + + return Err(Error::new(span, SyntaxError::UnterminatedRegExp)); } if escaped { @@ -1145,20 +1145,22 @@ impl<'a, I: Input> Lexer<'a, I> { '/' if !in_class => break, _ => {} } + escaped = c == '\\'; } + l.bump(); buf.push(c); } Ok(Atom::new(&**buf)) })?; - // let content_span = Span::new(content_start, self.cur_pos(), - // Default::default()); // input is terminated without following `/` if !self.is(b'/') { - self.error(start, SyntaxError::UnterminatedRegExp)?; + let span = self.span(start); + + return Err(Error::new(span, SyntaxError::UnterminatedRegExp)); } self.bump(); // '/' @@ -1287,6 +1289,11 @@ impl<'a, I: Input> Lexer<'a, I> { pub fn set_expr_allowed(&mut self, allow: bool) { self.state.is_expr_allowed = allow; } + + #[inline] + pub fn set_next_regexp(&mut self, start: Option) { + self.state.next_regexp = start; + } } fn pos_span(p: BytePos) -> Span { diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index 0147297f9998..58cebcf42e49 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -22,6 +22,7 @@ use crate::{ #[derive(Clone)] pub(super) struct State { pub is_expr_allowed: bool, + pub next_regexp: Option, /// if line break exists between previous token and new token? pub had_line_break: bool, /// TODO: Remove this field. @@ -152,6 +153,11 @@ impl Tokens for Lexer<'_, I> { self.set_expr_allowed(allow) } + #[inline] + fn set_next_regexp(&mut self, start: Option) { + self.state.next_regexp = start; + } + #[inline] fn token_context(&self) -> &TokenContexts { &self.state.context @@ -191,6 +197,10 @@ impl<'a, I: Input> Iterator for Lexer<'a, I> { let mut start = self.cur_pos(); let res = (|| -> Result, _> { + if let Some(start) = self.state.next_regexp { + return Ok(Some(self.read_regexp(start)?)); + } + if self.state.is_first { if let Some(shebang) = self.read_shebang()? { return Ok(Some(Token::Shebang(shebang))); @@ -363,6 +373,7 @@ impl State { State { is_expr_allowed: true, + next_regexp: None, is_first: true, had_line_break: false, prev_hi: start_pos, diff --git a/crates/swc_ecma_parser/src/lexer/tests.rs b/crates/swc_ecma_parser/src/lexer/tests.rs index d665b2c6f1b1..940c8abb281d 100644 --- a/crates/swc_ecma_parser/src/lexer/tests.rs +++ b/crates/swc_ecma_parser/src/lexer/tests.rs @@ -416,7 +416,9 @@ fn regexp_unary_void() { lex(Syntax::default(), "void /test/"), vec![ Void.span(0..4).lb(), - Regex("test".into(), "".into()).span(5..11), + BinOp(Div).span(5), + Word(Word::Ident("test".into())).span(6..10), + BinOp(Div).span(10), ] ); assert_eq!( @@ -424,7 +426,9 @@ fn regexp_unary_void() { vec![ Void.span(0..4).lb(), LParen.span(5..6), - Regex("test".into(), "".into()).span(6..12), + BinOp(Div).span(6), + Word(Word::Ident("test".into())).span(7..11), + BinOp(Div).span(11), RParen.span(12..13), ] ); @@ -483,13 +487,28 @@ fn simple_regex() { vec![ "x".span(0).lb(), Assign.span(2), - Regex("42".into(), "i".into(),).span(4..9), + BinOp(Div).span(4), + 42.span(5..7), + BinOp(Div).span(7), + Word(Word::Ident("i".into())).span(8), ], ); assert_eq!( lex(Syntax::default(), "/42/"), - vec![Regex("42".into(), "".into()).span(0..4).lb(),] + vec![ + TokenAndSpan { + token: Token::BinOp(BinOpToken::Div), + had_line_break: true, + span: Span { + lo: BytePos(1), + hi: BytePos(2), + ctxt: Default::default(), + }, + }, + 42.span(1..3), + BinOp(Div).span(3) + ] ); } @@ -508,7 +527,13 @@ fn complex_regex() { RParen, LBrace, RBrace, - Regex("42".into(), "i".into(),), + BinOp(Div), + Num { + value: 42.0, + raw: Atom::new("42") + }, + BinOp(Div), + Word(Word::Ident("i".into())), ] ) } @@ -595,7 +620,9 @@ fn after_if() { RParen.span(4), LBrace.span(5), RBrace.span(6), - Regex("y".into(), "".into()).span(8..11), + Div.span(8), + "y".span(9), + Div.span(10), Dot.span(11), "test".span(12..16), LParen.span(16), @@ -639,7 +666,9 @@ fn migrated_0002() { vec![ "tokenize".span(0..8).lb(), LParen.span(8), - Regex("42".into(), "".into()).span(9..13), + BinOp(Div).span(9), + 42.span(10..12), + BinOp(Div).span(12), RParen.span(13), ], ) @@ -671,7 +700,9 @@ fn migrated_0004() { RParen.span(11), LBrace.span(12), RBrace.span(13), - Regex("42".into(), "".into()).span(15..19), + BinOp(Div).span(15), + 42.span(16..18), + BinOp(Div).span(18), ] ); } @@ -707,7 +738,20 @@ fn migrated_0006() { vec![ LBrace.span(0).lb(), RBrace.span(1), - Regex("42".into(), "".into()).span(3..7), + BinOp(Div).span(3), + TokenAndSpan { + token: Num { + value: 42.0, + raw: "42".into(), + }, + had_line_break: false, + span: Span { + lo: BytePos(5), + hi: BytePos(7), + ctxt: Default::default(), + } + }, + BinOp(Div).span(6), ], ) } diff --git a/crates/swc_ecma_parser/src/parser/expr.rs b/crates/swc_ecma_parser/src/parser/expr.rs index 227c6b4ec39b..3f73a9a70a66 100644 --- a/crates/swc_ecma_parser/src/parser/expr.rs +++ b/crates/swc_ecma_parser/src/parser/expr.rs @@ -332,31 +332,50 @@ impl Parser { } // Regexp - Token::Regex(..) => match bump!(self) { - Token::Regex(exp, flags) => { - let span = span!(self, start); - - let mut flags_count = flags.chars().fold( - AHashMap::::default(), - |mut map, flag| { - let key = match flag { - 'g' | 'i' | 'm' | 's' | 'u' | 'y' | 'd' => flag, - _ => '\u{0000}', // special marker for unknown flags - }; - map.entry(key).and_modify(|count| *count += 1).or_insert(1); - map - }, - ); - if flags_count.remove(&'\u{0000}').is_some() { - self.emit_err(span, SyntaxError::UnknownRegExpFlags); - } - if let Some((flag, _)) = flags_count.iter().find(|(_, count)| **count > 1) { - self.emit_err(span, SyntaxError::DuplicatedRegExpFlags(*flag)); + tok!('/') | tok!("/=") => { + bump!(self); + + self.input.set_next_regexp(Some(start)); + + if let Some(Token::Regex(..)) = self.input.cur() { + self.input.set_next_regexp(None); + + match bump!(self) { + Token::Regex(exp, flags) => { + let span = span!(self, start); + + let mut flags_count = flags.chars().fold( + AHashMap::::default(), + |mut map, flag| { + let key = match flag { + 'g' | 'i' | 'm' | 's' | 'u' | 'y' | 'd' => flag, + _ => '\u{0000}', // special marker for unknown flags + }; + map.entry(key).and_modify(|count| *count += 1).or_insert(1); + map + }, + ); + + if flags_count.remove(&'\u{0000}').is_some() { + self.emit_err(span, SyntaxError::UnknownRegExpFlags); + } + + if let Some((flag, _)) = + flags_count.iter().find(|(_, count)| **count > 1) + { + self.emit_err(span, SyntaxError::DuplicatedRegExpFlags(*flag)); + } + + return Ok(Box::new(Expr::Lit(Lit::Regex(Regex { + span, + exp, + flags, + })))); + } + _ => unreachable!(), } - return Ok(Box::new(Expr::Lit(Lit::Regex(Regex { span, exp, flags })))); } - _ => unreachable!(), - }, + } tok!('`') => { // parse template literal @@ -1880,7 +1899,10 @@ impl Parser { } if is!(self, ';') - || (!is!(self, '*') && !cur!(self, false).map(Token::starts_expr).unwrap_or(true)) + || (!is!(self, '*') + && !is!(self, '/') + && !is!(self, "/=") + && !cur!(self, false).map(Token::starts_expr).unwrap_or(true)) { Ok(Box::new(Expr::Yield(YieldExpr { span: span!(self, start), diff --git a/crates/swc_ecma_parser/src/parser/input.rs b/crates/swc_ecma_parser/src/parser/input.rs index 2931705f8371..e0c46b735feb 100644 --- a/crates/swc_ecma_parser/src/parser/input.rs +++ b/crates/swc_ecma_parser/src/parser/input.rs @@ -24,6 +24,8 @@ pub trait Tokens: Clone + Iterator { } fn set_expr_allowed(&mut self, allow: bool); + fn set_next_regexp(&mut self, start: Option); + fn token_context(&self) -> &lexer::TokenContexts; fn token_context_mut(&mut self) -> &mut lexer::TokenContexts; fn set_token_context(&mut self, _c: lexer::TokenContexts); @@ -110,6 +112,8 @@ impl Tokens for TokensInput { fn set_expr_allowed(&mut self, _: bool) {} + fn set_next_regexp(&mut self, _: Option) {} + fn token_context(&self) -> &TokenContexts { &self.token_ctx } @@ -222,6 +226,10 @@ impl Tokens for Capturing { self.inner.set_expr_allowed(allow) } + fn set_next_regexp(&mut self, start: Option) { + self.inner.set_next_regexp(start); + } + fn token_context(&self) -> &TokenContexts { self.inner.token_context() } @@ -467,6 +475,11 @@ impl Buffer { self.iter.set_expr_allowed(allow) } + #[inline] + pub fn set_next_regexp(&mut self, start: Option) { + self.iter.set_next_regexp(start); + } + #[inline] pub(crate) fn token_context(&self) -> &lexer::TokenContexts { self.iter.token_context() diff --git a/crates/swc_ecma_parser/src/parser/stmt.rs b/crates/swc_ecma_parser/src/parser/stmt.rs index 3ad1bfbf285f..e73824a2701a 100644 --- a/crates/swc_ecma_parser/src/parser/stmt.rs +++ b/crates/swc_ecma_parser/src/parser/stmt.rs @@ -2527,4 +2527,18 @@ const foo;"#; test_parser(src, Default::default(), |p| p.parse_script()); } + + #[test] + fn issue_6322() { + let src = "for ( ; { } / 1 ; ) ;"; + + test_parser(src, Default::default(), |p| p.parse_script()); + } + + #[test] + fn issue_6323() { + let src = "let x = 0 < { } / 0 ;"; + + test_parser(src, Default::default(), |p| p.parse_script()); + } } diff --git a/crates/swc_ecma_parser/tests/test262-error-references/fail/095bea002b10b8e1.js.stderr b/crates/swc_ecma_parser/tests/test262-error-references/fail/095bea002b10b8e1.js.stderr index 42808f21783e..16104415f098 100644 --- a/crates/swc_ecma_parser/tests/test262-error-references/fail/095bea002b10b8e1.js.stderr +++ b/crates/swc_ecma_parser/tests/test262-error-references/fail/095bea002b10b8e1.js.stderr @@ -1,6 +1,6 @@ - x Unexpected eof + x Unterminated regexp literal ,-[$DIR/tests/test262-parser/fail/095bea002b10b8e1.js:1:1] 1 | foo[/42 - : ^ + : ^^^ `---- diff --git a/crates/swc_ecma_parser/tests/test262-error-references/fail/97fc32bf01227e39.js.stderr b/crates/swc_ecma_parser/tests/test262-error-references/fail/97fc32bf01227e39.js.stderr index 4c1545894580..2c28f7c8cfc8 100644 --- a/crates/swc_ecma_parser/tests/test262-error-references/fail/97fc32bf01227e39.js.stderr +++ b/crates/swc_ecma_parser/tests/test262-error-references/fail/97fc32bf01227e39.js.stderr @@ -1,6 +1,6 @@ - x Unexpected eof + x Unterminated regexp literal ,-[$DIR/tests/test262-parser/fail/97fc32bf01227e39.js:1:1] 1 | [/[/] - : ^ + : ^^^^ `----