/
parser.py
490 lines (388 loc) · 16.5 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import re
from datetime import datetime, timedelta
from dateutil import tz
from arrow import locales
try:
from functools import lru_cache
except ImportError: # pragma: no cover
from backports.functools_lru_cache import lru_cache # pragma: no cover
class ParserError(ValueError):
pass
# Allows for ParserErrors to be propagated from _build_datetime()
# when day_of_year errors occur.
# Before this, the ParserErrors were caught by the try/except in
# _parse_multiformat() and the appropriate error message was not
# transmitted to the user.
class ParserMatchError(ParserError):
pass
class DateTimeParser(object):
_FORMAT_RE = re.compile(
r"(YYY?Y?|MM?M?M?|Do|DD?D?D?|d?d?d?d|HH?|hh?|mm?|ss?|S+|ZZ?Z?|a|A|X)"
)
_ESCAPE_RE = re.compile(r"\[[^\[\]]*\]")
_ONE_OR_TWO_DIGIT_RE = re.compile(r"\d{1,2}")
_ONE_OR_TWO_OR_THREE_DIGIT_RE = re.compile(r"\d{1,3}")
_ONE_OR_MORE_DIGIT_RE = re.compile(r"\d+")
_TWO_DIGIT_RE = re.compile(r"\d{2}")
_THREE_DIGIT_RE = re.compile(r"\d{3}")
_FOUR_DIGIT_RE = re.compile(r"\d{4}")
_TZ_Z_RE = re.compile(r"([\+\-])(\d{2})(?:(\d{2}))?|Z")
_TZ_ZZ_RE = re.compile(r"([\+\-])(\d{2})(?:\:(\d{2}))?|Z")
_TZ_NAME_RE = re.compile(r"\w[\w+\-/]+")
# NOTE: timestamps cannot be parsed from natural language strings (by removing the ^...$) because it will
# break cases like "15 Jul 2000" and a format list (see issue #447)
_TIMESTAMP_RE = re.compile(r"^-?\d+\.?\d+$")
_TIME_RE = re.compile(r"^(\d{2})(?:\:?(\d{2}))?(?:\:?(\d{2}))?(?:([\.\,])(\d+))?$")
_BASE_INPUT_RE_MAP = {
"YYYY": _FOUR_DIGIT_RE,
"YY": _TWO_DIGIT_RE,
"MM": _TWO_DIGIT_RE,
"M": _ONE_OR_TWO_DIGIT_RE,
"DDDD": _THREE_DIGIT_RE,
"DDD": _ONE_OR_TWO_OR_THREE_DIGIT_RE,
"DD": _TWO_DIGIT_RE,
"D": _ONE_OR_TWO_DIGIT_RE,
"HH": _TWO_DIGIT_RE,
"H": _ONE_OR_TWO_DIGIT_RE,
"hh": _TWO_DIGIT_RE,
"h": _ONE_OR_TWO_DIGIT_RE,
"mm": _TWO_DIGIT_RE,
"m": _ONE_OR_TWO_DIGIT_RE,
"ss": _TWO_DIGIT_RE,
"s": _ONE_OR_TWO_DIGIT_RE,
"X": _TIMESTAMP_RE,
"ZZZ": _TZ_NAME_RE,
"ZZ": _TZ_ZZ_RE,
"Z": _TZ_Z_RE,
"S": _ONE_OR_MORE_DIGIT_RE,
}
SEPARATORS = ["-", "/", "."]
def __init__(self, locale="en_us", cache_size=0):
self.locale = locales.get_locale(locale)
self._input_re_map = self._BASE_INPUT_RE_MAP.copy()
self._input_re_map.update(
{
"MMMM": self._generate_choice_re(
self.locale.month_names[1:], re.IGNORECASE
),
"MMM": self._generate_choice_re(
self.locale.month_abbreviations[1:], re.IGNORECASE
),
"Do": re.compile(self.locale.ordinal_day_re),
"dddd": self._generate_choice_re(
self.locale.day_names[1:], re.IGNORECASE
),
"ddd": self._generate_choice_re(
self.locale.day_abbreviations[1:], re.IGNORECASE
),
"d": re.compile(r"[1-7]"),
"a": self._generate_choice_re(
(self.locale.meridians["am"], self.locale.meridians["pm"])
),
# note: 'A' token accepts both 'am/pm' and 'AM/PM' formats to
# ensure backwards compatibility of this token
"A": self._generate_choice_re(self.locale.meridians.values()),
}
)
if cache_size > 0:
self._generate_pattern_re = lru_cache(maxsize=cache_size)(
self._generate_pattern_re
)
# TODO: since we support more than ISO-8601, we should rename this function
# IDEA: break into multiple functions
def parse_iso(self, datetime_string):
# TODO: add a flag to normalize whitespace (useful in logs, ref issue #421)
has_space_divider = " " in datetime_string
has_t_divider = "T" in datetime_string
num_spaces = datetime_string.count(" ")
if has_space_divider and num_spaces != 1 or has_t_divider and num_spaces > 0:
raise ParserError(
"Expected an ISO 8601-like string, but was given '{}'. Try passing in a format string to resolve this.".format(
datetime_string
)
)
has_time = has_space_divider or has_t_divider
has_tz = False
# date formats (ISO-8601 and others) to test against
# NOTE: YYYYMM is omitted to avoid confusion with YYMMDD (no longer part of ISO 8601, but is still often used)
formats = [
"YYYY-MM-DD",
"YYYY-M-DD",
"YYYY-M-D",
"YYYY/MM/DD",
"YYYY/M/DD",
"YYYY/M/D",
"YYYY.MM.DD",
"YYYY.M.DD",
"YYYY.M.D",
"YYYYMMDD",
"YYYY-DDDD",
"YYYYDDDD",
"YYYY-MM",
"YYYY/MM",
"YYYY.MM",
"YYYY",
]
if has_time:
if has_space_divider:
date_string, time_string = datetime_string.split(" ", 1)
else:
date_string, time_string = datetime_string.split("T", 1)
time_parts = re.split(r"[\+\-Z]", time_string, 1, re.IGNORECASE)
time_components = self._TIME_RE.match(time_parts[0])
if time_components is None:
raise ParserError(
"Invalid time component provided. Please specify a format or provide a valid time component in the basic or extended ISO 8601 time format."
)
hours, minutes, seconds, subseconds_sep, subseconds = (
time_components.groups()
)
has_tz = len(time_parts) == 2
has_minutes = minutes is not None
has_seconds = seconds is not None
has_subseconds = subseconds is not None
is_basic_time_format = ":" not in time_parts[0]
tz_format = "Z"
# use 'ZZ' token instead since tz offset is present in non-basic format
if has_tz and ":" in time_parts[1]:
tz_format = "ZZ"
time_sep = "" if is_basic_time_format else ":"
if has_subseconds:
time_string = "HH{time_sep}mm{time_sep}ss{subseconds_sep}S".format(
time_sep=time_sep, subseconds_sep=subseconds_sep
)
elif has_seconds:
time_string = "HH{time_sep}mm{time_sep}ss".format(time_sep=time_sep)
elif has_minutes:
time_string = "HH{time_sep}mm".format(time_sep=time_sep)
else:
time_string = "HH"
if has_space_divider:
formats = ["{} {}".format(f, time_string) for f in formats]
else:
formats = ["{}T{}".format(f, time_string) for f in formats]
if has_time and has_tz:
# Add "Z" or "ZZ" to the format strings to indicate to
# _parse_token() that a timezone needs to be parsed
formats = ["{}{}".format(f, tz_format) for f in formats]
return self._parse_multiformat(datetime_string, formats)
def parse(self, datetime_string, fmt):
if isinstance(fmt, list):
return self._parse_multiformat(datetime_string, fmt)
fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt)
match = fmt_pattern_re.search(datetime_string)
if match is None:
raise ParserMatchError(
"Failed to match '{}' when parsing '{}'".format(fmt, datetime_string)
)
parts = {}
for token in fmt_tokens:
if token == "Do":
value = match.group("value")
else:
value = match.group(token)
self._parse_token(token, value, parts)
return self._build_datetime(parts)
def _generate_pattern_re(self, fmt):
# fmt is a string of tokens like 'YYYY-MM-DD'
# we construct a new string by replacing each
# token by its pattern:
# 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
tokens = []
offset = 0
# Escape all special RegEx chars
escaped_fmt = re.escape(fmt)
# Extract the bracketed expressions to be reinserted later.
escaped_fmt = re.sub(self._ESCAPE_RE, "#", escaped_fmt)
# Any number of S is the same as one.
# TODO: allow users to specify the number of digits to parse
escaped_fmt = re.sub(r"S+", "S", escaped_fmt)
escaped_data = re.findall(self._ESCAPE_RE, fmt)
fmt_pattern = escaped_fmt
for m in self._FORMAT_RE.finditer(escaped_fmt):
token = m.group(0)
try:
input_re = self._input_re_map[token]
except KeyError:
raise ParserError("Unrecognized token '{}'".format(token))
input_pattern = "(?P<{}>{})".format(token, input_re.pattern)
tokens.append(token)
# a pattern doesn't have the same length as the token
# it replaces! We keep the difference in the offset variable.
# This works because the string is scanned left-to-right and matches
# are returned in the order found by finditer.
fmt_pattern = (
fmt_pattern[: m.start() + offset]
+ input_pattern
+ fmt_pattern[m.end() + offset :]
)
offset += len(input_pattern) - (m.end() - m.start())
final_fmt_pattern = ""
split_fmt = fmt_pattern.split(r"\#")
# Due to the way Python splits, 'split_fmt' will always be longer
for i in range(len(split_fmt)):
final_fmt_pattern += split_fmt[i]
if i < len(escaped_data):
final_fmt_pattern += escaped_data[i][1:-1]
# Wrap final_fmt_pattern in a custom word boundary to strictly
# match the formatting pattern and filter out date and time formats
# that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah,
# blah1998-09-12blah. The custom word boundary matches every character
# that is not a whitespace character to allow for searching for a date
# and time string in a natural language sentence. Therefore, searching
# for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will
# work properly.
# Reference: https://stackoverflow.com/q/14232931/3820660
starting_word_boundary = r"(?<![\S])"
ending_word_boundary = r"(?![\S])"
bounded_fmt_pattern = r"{}{}{}".format(
starting_word_boundary, final_fmt_pattern, ending_word_boundary
)
return tokens, re.compile(bounded_fmt_pattern, flags=re.IGNORECASE)
def _parse_token(self, token, value, parts):
if token == "YYYY":
parts["year"] = int(value)
elif token == "YY":
value = int(value)
parts["year"] = 1900 + value if value > 68 else 2000 + value
elif token in ["MMMM", "MMM"]:
parts["month"] = self.locale.month_number(value.lower())
elif token in ["MM", "M"]:
parts["month"] = int(value)
elif token in ["DDDD", "DDD"]:
parts["day_of_year"] = int(value)
elif token in ["DD", "D"]:
parts["day"] = int(value)
elif token in ["Do"]:
parts["day"] = int(value)
elif token.upper() in ["HH", "H"]:
parts["hour"] = int(value)
elif token in ["mm", "m"]:
parts["minute"] = int(value)
elif token in ["ss", "s"]:
parts["second"] = int(value)
elif token == "S":
# We have the *most significant* digits of an arbitrary-precision integer.
# We want the six most significant digits as an integer, rounded.
# IDEA: add nanosecond support somehow? Need datetime support for it first.
value = value.ljust(7, str("0"))
# floating-point (IEEE-754) defaults to half-to-even rounding
seventh_digit = int(value[6])
if seventh_digit == 5:
rounding = int(value[5]) % 2
elif seventh_digit > 5:
rounding = 1
else:
rounding = 0
parts["microsecond"] = int(value[:6]) + rounding
elif token == "X":
parts["timestamp"] = float(value)
elif token in ["ZZZ", "ZZ", "Z"]:
parts["tzinfo"] = TzinfoParser.parse(value)
elif token in ["a", "A"]:
if value in (self.locale.meridians["am"], self.locale.meridians["AM"]):
parts["am_pm"] = "am"
elif value in (self.locale.meridians["pm"], self.locale.meridians["PM"]):
parts["am_pm"] = "pm"
@staticmethod
def _build_datetime(parts):
timestamp = parts.get("timestamp")
if timestamp is not None:
return datetime.fromtimestamp(timestamp, tz=tz.tzutc())
day_of_year = parts.get("day_of_year")
if day_of_year is not None:
year = parts.get("year")
month = parts.get("month")
if year is None:
raise ParserError(
"Year component is required with the DDD and DDDD tokens."
)
if month is not None:
raise ParserError(
"Month component is not allowed with the DDD and DDDD tokens."
)
date_string = "{}-{}".format(year, day_of_year)
try:
dt = datetime.strptime(date_string, "%Y-%j")
except ValueError:
raise ParserError(
"The provided day of year '{}' is invalid.".format(day_of_year)
)
parts["year"] = dt.year
parts["month"] = dt.month
parts["day"] = dt.day
am_pm = parts.get("am_pm")
hour = parts.get("hour", 0)
if am_pm == "pm" and hour < 12:
hour += 12
elif am_pm == "am" and hour == 12:
hour = 0
# account for rounding up to 1000000
microsecond = parts.get("microsecond", 0)
if microsecond == 1000000:
microsecond = 0
second_increment = 1
else:
second_increment = 0
increment = timedelta(seconds=second_increment)
return (
datetime(
year=parts.get("year", 1),
month=parts.get("month", 1),
day=parts.get("day", 1),
hour=hour,
minute=parts.get("minute", 0),
second=parts.get("second", 0),
microsecond=microsecond,
tzinfo=parts.get("tzinfo"),
)
+ increment
)
def _parse_multiformat(self, string, formats):
_datetime = None
for fmt in formats:
try:
_datetime = self.parse(string, fmt)
break
except ParserMatchError:
pass
if _datetime is None:
raise ParserError(
"Could not match input '{}' to any of the following formats: {}".format(
string, ", ".join(formats)
)
)
return _datetime
# generates a capture group of choices separated by an OR operator
@staticmethod
def _generate_choice_re(choices, flags=0):
return re.compile(r"({})".format("|".join(choices)), flags=flags)
class TzinfoParser(object):
# TODO: test against full timezone DB
_TZINFO_RE = re.compile(r"^([\+\-])?(\d{2})(?:\:?(\d{2}))?$")
@classmethod
def parse(cls, tzinfo_string):
tzinfo = None
if tzinfo_string == "local":
tzinfo = tz.tzlocal()
elif tzinfo_string in ["utc", "UTC", "Z"]:
tzinfo = tz.tzutc()
else:
iso_match = cls._TZINFO_RE.match(tzinfo_string)
if iso_match:
sign, hours, minutes = iso_match.groups()
if minutes is None:
minutes = 0
seconds = int(hours) * 3600 + int(minutes) * 60
if sign == "-":
seconds *= -1
tzinfo = tz.tzoffset(None, seconds)
else:
tzinfo = tz.gettz(tzinfo_string)
if tzinfo is None:
raise ParserError(
'Could not parse timezone expression "{}"'.format(tzinfo_string)
)
return tzinfo