Fix #474 - redo QuotedString '\' escape handling as a state machine so that all transforms are done left to right
pyparsing · Mar 28, 2023 · d46eb9e · d46eb9e
1 parent 1419802
commit d46eb9e
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 19 deletions.
diff --git a/CHANGES b/CHANGES
@@ -2,8 +2,31 @@
 Change Log
 ==========
 
+NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as
+`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should
+get released some time later in 2023. I currently plan to completely
+drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until
+at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to
+the new function names before the old functions are completely removed. (Big
+help from Devin J. Pohly in structuring the code to enable this peaceful transition.)
+
+Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7.
+
+
 Version 3.1.0a2 - (in development)
 ----------------------------------
+- API CHANGE: A slight change has been made to how a quoted string parsed using the
+  QuotedString class is unquoted. Formerly, when unquoting and processing whitespace
+  markers such as \t and \n, these substitutions would occur first, and then any
+  additional '\' escaping would be done on the resulting string. This would unquote
+  "\\n" as "\<newline>" (a backslash followed by a newline). Now escapes and
+  whitespace markers are all processed in a single pass working left to right, so
+  the quoted string "\\n" gets unquoted to "\n" (a backslash followed by "n").
+  Fixes issue #474 raised by jakeanq, thanks!
+
+- Added named field "url" to pyparsing.common.url, returning the entire
+  parsed URL string.
+
 - Fixed bug when parse actions returned an empty string for an expression that
   had a results name, that the results name was not saved. That is:
 
@@ -27,16 +50,6 @@ Version 3.1.0a2 - (in development)
 
 Version 3.1.0a1 - March, 2023
 -----------------------------
-NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as
-`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should
-get released some time later in 2023. I currently plan to completely
-drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until
-at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to
-the new function names before the old functions are completely removed. (Big
-help from Devin J. Pohly in structuring the code to enable this peaceful transition.)
-
-Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7.
-
 - API ENHANCEMENT: `Optional(expr)` may now be written as `expr | ""`
 
   This will make this code:

diff --git a/pyparsing/core.py b/pyparsing/core.py
@@ -3194,7 +3194,7 @@ class QuotedString(Token):
         [['This is the "quote"']]
         [['This is the quote with "embedded" quotes']]
     """
-    ws_map = ((r"\t", "\t"), (r"\n", "\n"), (r"\f", "\f"), (r"\r", "\r"))
+    ws_map = dict(((r"\t", "\t"), (r"\n", "\n"), (r"\f", "\f"), (r"\r", "\r")))
 
     def __init__(
         self,
@@ -3244,6 +3244,7 @@ def __init__(
         self.escQuote: str = escQuote or ""
         self.unquoteResults: bool = unquoteResults
         self.convertWhitespaceEscapes: bool = convertWhitespaceEscapes
+        self.multiline = multiline
 
         sep = ""
         inner_pattern = ""
@@ -3292,6 +3293,17 @@ def __init__(
             ]
         )
 
+        if self.unquoteResults:
+            if self.convertWhitespaceEscapes:
+                self.unquote_scan_re = re.compile(
+                    rf"({'|'.join(re.escape(k) for k in self.ws_map)})|({re.escape(self.escChar)}.)|(\n|.)",
+                    flags=self.flags,
+                )
+            else:
+                self.unquote_scan_re = re.compile(
+                    rf"({re.escape(self.escChar)}.)|(\n|.)", flags=self.flags
+                )
+
         try:
             self.re = re.compile(self.pattern, self.flags)
             self.reString = self.pattern
@@ -3327,14 +3339,20 @@ def parseImpl(self, instring, loc, doActions=True):
             ret = ret[self.quoteCharLen : -self.endQuoteCharLen]
 
             if isinstance(ret, str_type):
-                # replace escaped whitespace
-                if "\\" in ret and self.convertWhitespaceEscapes:
-                    for wslit, wschar in self.ws_map:
-                        ret = ret.replace(wslit, wschar)
-
-                # replace escaped characters
-                if self.escChar:
-                    ret = re.sub(self.escCharReplacePattern, r"\g<1>", ret)
+                if self.convertWhitespaceEscapes:
+                    ret = "".join(
+                        self.ws_map[match.group(1)]
+                        if match.group(1)
+                        else match.group(2)[-1]
+                        if match.group(2)
+                        else match.group(3)
+                        for match in self.unquote_scan_re.finditer(ret)
+                    )
+                else:
+                    ret = "".join(
+                        match.group(1)[-1] if match.group(1) else match.group(2)
+                        for match in self.unquote_scan_re.finditer(ret)
+                    )
 
                 # replace escaped quotes
                 if self.escQuote:

diff --git a/tests/test_unit.py b/tests/test_unit.py
@@ -1265,6 +1265,63 @@ def testQuotedStrings(self):
             )
             self.assertEqual(source, stripped)
 
+    def testQuotedStringUnquotesAndConvertWhitespaceEscapes(self):
+        # test for Issue #474
+        #fmt: off
+        backslash = chr(92)  # a single backslash
+        tab = "\t"
+        newline = "\n"
+        test_string_0 = f'"{backslash}{backslash}n"'              # r"\\n"
+        test_string_1 = f'"{backslash}t{backslash}{backslash}n"'  # r"\t\\n"
+        test_string_2 = f'"a{backslash}tb"'                       # r"a\tb"
+        test_string_3 = f'"{backslash}{backslash}{backslash}n"'   # r"\\\n"
+        T, F = True, False  # these make the test cases format nicely
+        for test_parameters in (
+                # Parameters are the arguments for creating a QuotedString
+                # and the expected parsed list of characters:
+                # - unquote_results
+                # - convert_whitespace_escapes
+                # - test string
+                # - expected parsed characters (broken out as separate
+                #   list items, since all those doubled backslashes make
+                #   it difficult to interpret the output)
+                (T, T, test_string_0, [backslash, "n"]),
+                (T, F, test_string_0, [backslash, "n"]),
+                (F, F, test_string_0, ['"', backslash, backslash, "n", '"']),
+                (T, T, test_string_1, [tab, backslash, "n"]),
+                (T, F, test_string_1, ["t", backslash, "n"]),
+                (F, F, test_string_1, ['"', backslash, "t", backslash, backslash, "n", '"']),
+                (T, T, test_string_2, ["a", tab, "b"]),
+                (T, F, test_string_2, ["a", "t", "b"]),
+                (F, F, test_string_2, ['"', "a", backslash, "t", "b", '"']),
+                (T, T, test_string_3, [backslash, newline]),
+                (T, F, test_string_3, [backslash, "n"]),
+                (F, F, test_string_3, ['"', backslash, backslash, backslash, "n", '"']),
+        ):
+            unquote_results, convert_ws_escapes, test_string, expected_list = test_parameters
+            test_description = f"Testing with parameters {test_parameters}"
+            with self.subTest(msg=test_description):
+                print(test_description)
+                print(f"unquote_results: {unquote_results}"
+                      f"\nconvert_whitespace_escapes: {convert_ws_escapes}")
+                qs_expr = pp.QuotedString(
+                        quoteChar='"',
+                        escChar='\\',
+                        unquote_results=unquote_results,
+                        convert_whitespace_escapes=convert_ws_escapes
+                    )
+                result = qs_expr.parse_string(test_string)
+
+                # do this instead of assertParserAndCheckList to explicitly
+                # check and display the separate items in the list
+                print("Results:")
+                control_chars = {newline: "<NEWLINE>", backslash: "<BACKSLASH>", tab: "<TAB>"}
+                print(f"[{', '.join(control_chars.get(c, repr(c)) for c in result[0])}]")
+                self.assertEqual(expected_list, list(result[0]))
+
+                print()
+        #fmt: on
+
     def testCaselessOneOf(self):
         caseless1 = pp.oneOf("d a b c aA B A C", caseless=True)
         caseless1str = str(caseless1)