From 1e905a71f176da17a26c0db75bb3c570e6499c2b Mon Sep 17 00:00:00 2001 From: Ofek Lev Date: Sun, 14 Nov 2021 18:01:27 -0500 Subject: [PATCH 1/6] Lazily compile regular expressions to speed up load time --- tomli/_parser.py | 15 ++++----------- tomli/_re.py | 45 +++++++++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/tomli/_parser.py b/tomli/_parser.py index 89e81c3..4d326b5 100644 --- a/tomli/_parser.py +++ b/tomli/_parser.py @@ -3,14 +3,7 @@ from typing import Any, BinaryIO, Dict, FrozenSet, Iterable, NamedTuple, Optional, Tuple import warnings -from tomli._re import ( - RE_DATETIME, - RE_LOCALTIME, - RE_NUMBER, - match_to_datetime, - match_to_localtime, - match_to_number, -) +from tomli._re import Patterns, match_to_datetime, match_to_localtime, match_to_number from tomli._types import Key, ParseFloat, Pos ASCII_CTRL = frozenset(chr(i) for i in range(32)) | frozenset(chr(127)) @@ -605,21 +598,21 @@ def parse_value( # noqa: C901 return pos + 5, False # Dates and times - datetime_match = RE_DATETIME.match(src, pos) + datetime_match = Patterns.datetime.match(src, pos) if datetime_match: try: datetime_obj = match_to_datetime(datetime_match) except ValueError as e: raise suffixed_err(src, pos, "Invalid date or datetime") from e return datetime_match.end(), datetime_obj - localtime_match = RE_LOCALTIME.match(src, pos) + localtime_match = Patterns.localtime.match(src, pos) if localtime_match: return localtime_match.end(), match_to_localtime(localtime_match) # Integers and "normal" floats. # The regex will greedily match any type starting with a decimal # char, so needs to be located after handling of dates and times. - number_match = RE_NUMBER.match(src, pos) + number_match = Patterns.number.match(src, pos) if number_match: return number_match.end(), match_to_number(number_match, parse_float) diff --git a/tomli/_re.py b/tomli/_re.py index 9126829..dd47526 100644 --- a/tomli/_re.py +++ b/tomli/_re.py @@ -10,8 +10,26 @@ # - 00:32:00 _TIME_RE_STR = r"([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(?:\.([0-9]{1,6})[0-9]*)?" -RE_NUMBER = re.compile( - r""" + +class _LazyPatternCompiler: + def __getattr__(self, name: str) -> "re.Pattern": + if name == "datetime": + pattern = re.compile( + fr""" +([0-9]{{4}})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01]) # date, e.g. 1988-10-27 +(?: + [T ] + {_TIME_RE_STR} + (?:(Z)|([+-])([01][0-9]|2[0-3]):([0-5][0-9]))? # optional time offset +)? +""", + flags=re.VERBOSE, + ) + elif name == "localtime": + pattern = re.compile(_TIME_RE_STR) + elif name == "number": + pattern = re.compile( + r""" 0 (?: x[0-9A-Fa-f](?:_?[0-9A-Fa-f])* # hex @@ -27,20 +45,15 @@ (?:[eE][+-]?[0-9](?:_?[0-9])*)? # optional exponent part ) """, - flags=re.VERBOSE, -) -RE_LOCALTIME = re.compile(_TIME_RE_STR) -RE_DATETIME = re.compile( - fr""" -([0-9]{{4}})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01]) # date, e.g. 1988-10-27 -(?: - [T ] - {_TIME_RE_STR} - (?:(Z)|([+-])([01][0-9]|2[0-3]):([0-5][0-9]))? # optional time offset -)? -""", - flags=re.VERBOSE, -) + flags=re.VERBOSE, + ) + else: # pragma: no cover + raise AttributeError(f"Unknown pattern: {name}") + + setattr(self, name, pattern) + return pattern + +Patterns = _LazyPatternCompiler() def match_to_datetime(match: "re.Match") -> Union[datetime, date]: From ac5cd49b456f3cb2a6c9bf8ff05bb61154770675 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 14 Nov 2021 23:03:06 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tomli/_re.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tomli/_re.py b/tomli/_re.py index dd47526..231690c 100644 --- a/tomli/_re.py +++ b/tomli/_re.py @@ -53,6 +53,7 @@ def __getattr__(self, name: str) -> "re.Pattern": setattr(self, name, pattern) return pattern + Patterns = _LazyPatternCompiler() From 1e391ddf028c7b43299dffcb9a1ae90bfef8a8f0 Mon Sep 17 00:00:00 2001 From: Ofek Lev Date: Sun, 14 Nov 2021 18:59:23 -0500 Subject: [PATCH 3/6] re-order conditions based on speed of checking and likelihood Co-Authored-By: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com> --- tomli/_parser.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tomli/_parser.py b/tomli/_parser.py index 4d326b5..6163ce1 100644 --- a/tomli/_parser.py +++ b/tomli/_parser.py @@ -577,6 +577,8 @@ def parse_value( # noqa: C901 except IndexError: char = None + # IMPORTANT: order conditions based on speed of checking and likelihood + # Basic strings if char == '"': if src.startswith('"""', pos): @@ -597,6 +599,21 @@ def parse_value( # noqa: C901 if src.startswith("false", pos): return pos + 5, False + # Arrays + if char == "[": + return parse_array(src, pos, parse_float) + + # Inline tables + if char == "{": + return parse_inline_table(src, pos, parse_float) + + # Integers and "normal" floats. + # The regex will greedily match any type starting with a decimal + # char, so needs to be located after handling of dates and times. + number_match = Patterns.number.match(src, pos) + if number_match: + return number_match.end(), match_to_number(number_match, parse_float) + # Dates and times datetime_match = Patterns.datetime.match(src, pos) if datetime_match: @@ -609,21 +626,6 @@ def parse_value( # noqa: C901 if localtime_match: return localtime_match.end(), match_to_localtime(localtime_match) - # Integers and "normal" floats. - # The regex will greedily match any type starting with a decimal - # char, so needs to be located after handling of dates and times. - number_match = Patterns.number.match(src, pos) - if number_match: - return number_match.end(), match_to_number(number_match, parse_float) - - # Arrays - if char == "[": - return parse_array(src, pos, parse_float) - - # Inline tables - if char == "{": - return parse_inline_table(src, pos, parse_float) - # Special floats first_three = src[pos : pos + 3] if first_three in {"inf", "nan"}: From cec94dd0add931d34a6c452a5d1ef1e6d2947639 Mon Sep 17 00:00:00 2001 From: Ofek Lev Date: Sun, 14 Nov 2021 19:07:07 -0500 Subject: [PATCH 4/6] fix --- tomli/_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tomli/_parser.py b/tomli/_parser.py index 6163ce1..d068f8d 100644 --- a/tomli/_parser.py +++ b/tomli/_parser.py @@ -607,13 +607,6 @@ def parse_value( # noqa: C901 if char == "{": return parse_inline_table(src, pos, parse_float) - # Integers and "normal" floats. - # The regex will greedily match any type starting with a decimal - # char, so needs to be located after handling of dates and times. - number_match = Patterns.number.match(src, pos) - if number_match: - return number_match.end(), match_to_number(number_match, parse_float) - # Dates and times datetime_match = Patterns.datetime.match(src, pos) if datetime_match: @@ -626,6 +619,13 @@ def parse_value( # noqa: C901 if localtime_match: return localtime_match.end(), match_to_localtime(localtime_match) + # Integers and "normal" floats. + # The regex will greedily match any type starting with a decimal + # char, so needs to be located after handling of dates and times. + number_match = Patterns.number.match(src, pos) + if number_match: + return number_match.end(), match_to_number(number_match, parse_float) + # Special floats first_three = src[pos : pos + 3] if first_three in {"inf", "nan"}: From cb40c82d1818cfc1c9457b68d34722c255ebdab5 Mon Sep 17 00:00:00 2001 From: Ofek Lev Date: Sun, 14 Nov 2021 22:21:51 -0500 Subject: [PATCH 5/6] final update --- tomli/_parser.py | 16 ++++++++++++---- tomli/_re.py | 46 ++++++++++++++++------------------------------ 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/tomli/_parser.py b/tomli/_parser.py index d068f8d..6200cc4 100644 --- a/tomli/_parser.py +++ b/tomli/_parser.py @@ -3,7 +3,14 @@ from typing import Any, BinaryIO, Dict, FrozenSet, Iterable, NamedTuple, Optional, Tuple import warnings -from tomli._re import Patterns, match_to_datetime, match_to_localtime, match_to_number +from tomli._re import ( + RE_DATETIME, + RE_LOCALTIME, + RE_NUMBER, + match_to_datetime, + match_to_localtime, + match_to_number, +) from tomli._types import Key, ParseFloat, Pos ASCII_CTRL = frozenset(chr(i) for i in range(32)) | frozenset(chr(127)) @@ -599,6 +606,7 @@ def parse_value( # noqa: C901 if src.startswith("false", pos): return pos + 5, False + # Arrays if char == "[": return parse_array(src, pos, parse_float) @@ -608,21 +616,21 @@ def parse_value( # noqa: C901 return parse_inline_table(src, pos, parse_float) # Dates and times - datetime_match = Patterns.datetime.match(src, pos) + datetime_match = RE_DATETIME.match(src, pos) if datetime_match: try: datetime_obj = match_to_datetime(datetime_match) except ValueError as e: raise suffixed_err(src, pos, "Invalid date or datetime") from e return datetime_match.end(), datetime_obj - localtime_match = Patterns.localtime.match(src, pos) + localtime_match = RE_LOCALTIME.match(src, pos) if localtime_match: return localtime_match.end(), match_to_localtime(localtime_match) # Integers and "normal" floats. # The regex will greedily match any type starting with a decimal # char, so needs to be located after handling of dates and times. - number_match = Patterns.number.match(src, pos) + number_match = RE_NUMBER.match(src, pos) if number_match: return number_match.end(), match_to_number(number_match, parse_float) diff --git a/tomli/_re.py b/tomli/_re.py index 231690c..9126829 100644 --- a/tomli/_re.py +++ b/tomli/_re.py @@ -10,26 +10,8 @@ # - 00:32:00 _TIME_RE_STR = r"([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(?:\.([0-9]{1,6})[0-9]*)?" - -class _LazyPatternCompiler: - def __getattr__(self, name: str) -> "re.Pattern": - if name == "datetime": - pattern = re.compile( - fr""" -([0-9]{{4}})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01]) # date, e.g. 1988-10-27 -(?: - [T ] - {_TIME_RE_STR} - (?:(Z)|([+-])([01][0-9]|2[0-3]):([0-5][0-9]))? # optional time offset -)? -""", - flags=re.VERBOSE, - ) - elif name == "localtime": - pattern = re.compile(_TIME_RE_STR) - elif name == "number": - pattern = re.compile( - r""" +RE_NUMBER = re.compile( + r""" 0 (?: x[0-9A-Fa-f](?:_?[0-9A-Fa-f])* # hex @@ -45,16 +27,20 @@ def __getattr__(self, name: str) -> "re.Pattern": (?:[eE][+-]?[0-9](?:_?[0-9])*)? # optional exponent part ) """, - flags=re.VERBOSE, - ) - else: # pragma: no cover - raise AttributeError(f"Unknown pattern: {name}") - - setattr(self, name, pattern) - return pattern - - -Patterns = _LazyPatternCompiler() + flags=re.VERBOSE, +) +RE_LOCALTIME = re.compile(_TIME_RE_STR) +RE_DATETIME = re.compile( + fr""" +([0-9]{{4}})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01]) # date, e.g. 1988-10-27 +(?: + [T ] + {_TIME_RE_STR} + (?:(Z)|([+-])([01][0-9]|2[0-3]):([0-5][0-9]))? # optional time offset +)? +""", + flags=re.VERBOSE, +) def match_to_datetime(match: "re.Match") -> Union[datetime, date]: From 9d2f08efd062bf2ea7313b196d36c2618d98a5a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Nov 2021 03:22:07 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tomli/_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tomli/_parser.py b/tomli/_parser.py index 6200cc4..ccd6b92 100644 --- a/tomli/_parser.py +++ b/tomli/_parser.py @@ -606,7 +606,6 @@ def parse_value( # noqa: C901 if src.startswith("false", pos): return pos + 5, False - # Arrays if char == "[": return parse_array(src, pos, parse_float)