Skip to content

Commit

Permalink
Improved JavaScript regex recognition for extracting JS messages
Browse files Browse the repository at this point in the history
  • Loading branch information
gitaarik committed Feb 19, 2022
1 parent 4f8c7f6 commit fdcac82
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 1 deletion.
9 changes: 9 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
Babel Changelog
===============

Next version
--------------

Bugfixes
~~~~~~~~

* Improved the regex used to recognize JavaScript regex literals. Previously the lexer failed to recognize
certain regexes (e.g. ones with an unescaped ``/`` inside a character class), which broke the parsing of JS files.

Version 2.9.1
-------------

Expand Down
52 changes: 51 additions & 1 deletion babel/messages/jslexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,57 @@
# Identifier: one character from `\w`, `$` or `_`, followed by any number of
# characters from the same set.
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
# Like name_re, but dots are also accepted after the first character. The
# pattern needs at least two characters to match.
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
# A slash, optionally followed by `=` (i.e. `/` or `/=`).
division_re = re.compile(r'/=?')
# Single-line regex-literal pattern: opening slash, a body in which a
# backslash escapes the following character, closing slash, then optional
# ASCII-letter flags. Any unescaped slash ends the body, including one that
# sits inside a character class.
# NOTE(review): `regex_re` is bound again by the verbose definition further
# down, so this assignment appears to be superseded -- confirm whether this
# line should remain.
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)

# Matches a JavaScript regular-expression literal: the opening slash, the
# body, the closing slash and any trailing flag letters.
#
# Each alternative in the body consumes exactly one unit (an escape pair, a
# complete character class, or one ordinary character), and the alternatives
# begin with mutually exclusive characters. There is therefore only one way
# to split any input between them, which keeps matching linear. Quantifying
# sub-patterns that can match empty or overlapping text inside the repeated
# group would instead backtrack exponentially on an unterminated literal.
regex_re = re.compile(
    r'''
    # Opening slash of the regex
    /
    (?:
        # 1) Backslashed character
        #
        # A backslash together with the character that follows it. This is
        # what allows an escaped slash inside the regex body.
        \\.
        |
        # 2) Regex character class such as `[a-z]`
        #
        # Inside a character class a slash may appear unescaped without
        # closing the regex. Escape pairs are consumed as a unit, so an
        # escaped closing bracket does not end the class.
        \[
        (?:
            \\.
            |
            [^\]\\]
        )*
        \]
        |
        # 3) Any other single character
        #
        # Anything except a closing slash, a backslash or an opening
        # bracket. The last two are handled by the alternatives above.
        [^/\\\[]
    )*
    # Closing slash of the regex
    /
    # Regex flags
    [a-zA-Z]*
    ''',
    re.DOTALL | re.VERBOSE
)

# A single line terminator: CRLF, LF or CR (captured as group 1).
line_re = re.compile(r'(\r\n|\n|\r)')
# A backslash immediately followed by a line terminator; built from
# line_re's pattern so both stay in sync.
line_join_re = re.compile(r'\\' + line_re.pattern)
# One to four hexadecimal digits.
# NOTE(review): presumably the digits of a unicode escape -- confirm at the
# call site.
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
Expand Down
29 changes: 29 additions & 0 deletions tests/messages/test_js_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,32 @@ def test_template_string_tag_usage():
)

assert messages == [(1, 'Tag template, wow', [], None)]


def test_regex_with_non_escaped_slash():
    """
    Regex literals containing an unescaped slash must be lexed correctly.

    A JavaScript regex literal is delimited by slashes, yet a bare slash is
    allowed inside a character class, e.g. ``[/]``. The JS lexer used to
    treat that inner slash as the end of the regex. When a double quote came
    right after the falsely detected end, the lexer assumed a string had
    started and extended it up to the next double quote, swallowing the code
    in between.

    The fixture reproduces that situation: the second message sits between
    the regex and a later double quote, so it is only extracted when the
    regex is recognized in full.
    """
    source = BytesIO(b"""\
msg1 = _('message 1')
regex1 = /[/]"/
msg2 = _('message 2')
fake_closing_quote = '"'
""")
    extracted = extract.extract(
        'javascript', source, extract.DEFAULT_KEYWORDS, [], {},
    )
    expected = [
        (1, 'message 1', [], None),
        (3, 'message 2', [], None),
    ]

    assert list(extracted) == expected

0 comments on commit fdcac82

Please sign in to comment.