Skip to content

Commit

Permalink
Fix #474 - redo QuotedString '\' escape handling as a state machine s…
Browse files Browse the repository at this point in the history
…o that all transforms are done left to right
  • Loading branch information
ptmcg committed Mar 28, 2023
1 parent 1419802 commit d46eb9e
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 19 deletions.
33 changes: 23 additions & 10 deletions CHANGES
Expand Up @@ -2,8 +2,31 @@
Change Log
==========

NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as
`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should
get released some time later in 2023. I currently plan to completely
drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until
at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to
the new function names before the old functions are completely removed. (Big
help from Devin J. Pohly in structuring the code to enable this peaceful transition.)

Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7.


Version 3.1.0a2 - (in development)
----------------------------------
- API CHANGE: A slight change has been implemented when unquoting a quoted string
parsed using the QuotedString class. Formerly, when unquoting and processing
whitespace markers such as \t and \n, these substitutions would occur first, and
then any additional '\' escaping would be done on the resulting string. This would
parse "\\n" as "\<newline>". Now escapes and whitespace markers are all processed
in a single pass working left to right, so the quoted string "\\n" would get unquoted
to "\n" (a backslash followed by "n"). Fixes issue #474 raised by jakeanq,
thanks!

- Added named field "url" to pyparsing.common.url, returning the entire
parsed URL string.

- Fixed bug where the results name was not saved when a parse action returned
an empty string for an expression that had a results name. That is:

Expand All @@ -27,16 +50,6 @@ Version 3.1.0a2 - (in development)

Version 3.1.0a1 - March, 2023
-----------------------------
NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as
`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should
get released some time later in 2023. I currently plan to completely
drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until
at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to
the new function names before the old functions are completely removed. (Big
help from Devin J. Pohly in structuring the code to enable this peaceful transition.)

Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7.

- API ENHANCEMENT: `Optional(expr)` may now be written as `expr | ""`

This will make this code:
Expand Down
36 changes: 27 additions & 9 deletions pyparsing/core.py
Expand Up @@ -3194,7 +3194,7 @@ class QuotedString(Token):
[['This is the "quote"']]
[['This is the quote with "embedded" quotes']]
"""
ws_map = ((r"\t", "\t"), (r"\n", "\n"), (r"\f", "\f"), (r"\r", "\r"))
# Maps each two-character whitespace escape literal (a backslash followed by
# a letter) to the control character it denotes.
ws_map = {r"\t": "\t", r"\n": "\n", r"\f": "\f", r"\r": "\r"}

def __init__(
self,
Expand Down Expand Up @@ -3244,6 +3244,7 @@ def __init__(
self.escQuote: str = escQuote or ""
self.unquoteResults: bool = unquoteResults
self.convertWhitespaceEscapes: bool = convertWhitespaceEscapes
self.multiline = multiline

sep = ""
inner_pattern = ""
Expand Down Expand Up @@ -3292,6 +3293,17 @@ def __init__(
]
)

if self.unquoteResults:
if self.convertWhitespaceEscapes:
self.unquote_scan_re = re.compile(
rf"({'|'.join(re.escape(k) for k in self.ws_map)})|({re.escape(self.escChar)}.)|(\n|.)",
flags=self.flags,
)
else:
self.unquote_scan_re = re.compile(
rf"({re.escape(self.escChar)}.)|(\n|.)", flags=self.flags
)

try:
self.re = re.compile(self.pattern, self.flags)
self.reString = self.pattern
Expand Down Expand Up @@ -3327,14 +3339,20 @@ def parseImpl(self, instring, loc, doActions=True):
ret = ret[self.quoteCharLen : -self.endQuoteCharLen]

if isinstance(ret, str_type):
# replace escaped whitespace
if "\\" in ret and self.convertWhitespaceEscapes:
for wslit, wschar in self.ws_map:
ret = ret.replace(wslit, wschar)

# replace escaped characters
if self.escChar:
ret = re.sub(self.escCharReplacePattern, r"\g<1>", ret)
if self.convertWhitespaceEscapes:
ret = "".join(
self.ws_map[match.group(1)]
if match.group(1)
else match.group(2)[-1]
if match.group(2)
else match.group(3)
for match in self.unquote_scan_re.finditer(ret)
)
else:
ret = "".join(
match.group(1)[-1] if match.group(1) else match.group(2)
for match in self.unquote_scan_re.finditer(ret)
)

# replace escaped quotes
if self.escQuote:
Expand Down
57 changes: 57 additions & 0 deletions tests/test_unit.py
Expand Up @@ -1265,6 +1265,63 @@ def testQuotedStrings(self):
)
self.assertEqual(source, stripped)

def testQuotedStringUnquotesAndConvertWhitespaceEscapes(self):
# Regression test for Issue #474: checks what QuotedString returns for
# strings that mix backslash escapes with whitespace markers (\t, \n),
# across combinations of unquote_results and convert_whitespace_escapes.
#fmt: off
backslash = chr(92) # a single backslash
tab = "\t"
newline = "\n"
# Each test string includes its enclosing double quotes; the trailing
# comment shows the same string written as a raw string literal.
test_string_0 = f'"{backslash}{backslash}n"' # r"\\n"
test_string_1 = f'"{backslash}t{backslash}{backslash}n"' # r"\t\\n"
test_string_2 = f'"a{backslash}tb"' # r"a\tb"
test_string_3 = f'"{backslash}{backslash}{backslash}n"' # r"\\\n"
T, F = True, False # these make the test cases format nicely
for test_parameters in (
# Each tuple holds the arguments used to create a QuotedString,
# the string to parse, and the expected parsed characters:
# - unquote_results
# - convert_whitespace_escapes
# - test string
# - expected parsed characters, broken out as separate list
# items (all those doubled backslashes make it difficult to
# interpret the output otherwise)
(T, T, test_string_0, [backslash, "n"]),
(T, F, test_string_0, [backslash, "n"]),
(F, F, test_string_0, ['"', backslash, backslash, "n", '"']),
(T, T, test_string_1, [tab, backslash, "n"]),
(T, F, test_string_1, ["t", backslash, "n"]),
(F, F, test_string_1, ['"', backslash, "t", backslash, backslash, "n", '"']),
(T, T, test_string_2, ["a", tab, "b"]),
(T, F, test_string_2, ["a", "t", "b"]),
(F, F, test_string_2, ['"', "a", backslash, "t", "b", '"']),
(T, T, test_string_3, [backslash, newline]),
(T, F, test_string_3, [backslash, "n"]),
(F, F, test_string_3, ['"', backslash, backslash, backslash, "n", '"']),
):
unquote_results, convert_ws_escapes, test_string, expected_list = test_parameters
test_description = f"Testing with parameters {test_parameters}"
with self.subTest(msg=test_description):
print(test_description)
print(f"unquote_results: {unquote_results}"
f"\nconvert_whitespace_escapes: {convert_ws_escapes}")
qs_expr = pp.QuotedString(
quoteChar='"',
escChar='\\',
unquote_results=unquote_results,
convert_whitespace_escapes=convert_ws_escapes
)
result = qs_expr.parse_string(test_string)

# do this instead of assertParserAndCheckList to explicitly
# check and display the separate items in the list
print("Results:")
# print newline, backslash and tab as readable placeholders; every
# other character is shown via repr()
control_chars = {newline: "<NEWLINE>", backslash: "<BACKSLASH>", tab: "<TAB>"}
print(f"[{', '.join(control_chars.get(c, repr(c)) for c in result[0])}]")
# compare character by character against the expected list
self.assertEqual(expected_list, list(result[0]))

print()
#fmt: on

def testCaselessOneOf(self):
caseless1 = pp.oneOf("d a b c aA B A C", caseless=True)
caseless1str = str(caseless1)
Expand Down

0 comments on commit d46eb9e

Please sign in to comment.