Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve date/time parsing #834

Merged
merged 3 commits into from Jan 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
63 changes: 42 additions & 21 deletions babel/dates.py
Expand Up @@ -1138,7 +1138,11 @@ def get_period_id(time, tzinfo=None, type=None, locale=LC_TIME):
return "pm"


def parse_date(string, locale=LC_TIME):
class ParseError(ValueError):
    """Raised when a date or time string cannot be parsed.

    For example, the parsing functions raise it when the input contains no
    digits at all.  It subclasses `ValueError`, so existing callers that
    catch ``ValueError`` keep working.
    """


def parse_date(string, locale=LC_TIME, format='medium'):
"""Parse a date from a string.

This function uses the date format for the locale as a hint to determine
Expand All @@ -1151,14 +1155,19 @@ def parse_date(string, locale=LC_TIME):

:param string: the string containing the date
:param locale: a `Locale` object or a locale identifier
:param format: the format to use (see ``get_date_format``)
"""
numbers = re.findall(r'(\d+)', string)
if not numbers:
raise ParseError("No numbers were found in input")

# TODO: try ISO format first?
format = get_date_format(locale=locale).pattern.lower()
year_idx = format.index('y')
month_idx = format.index('m')
format_str = get_date_format(format=format, locale=locale).pattern.lower()
year_idx = format_str.index('y')
month_idx = format_str.index('m')
if month_idx < 0:
month_idx = format.index('l')
day_idx = format.index('d')
month_idx = format_str.index('l')
day_idx = format_str.index('d')

indexes = [(year_idx, 'Y'), (month_idx, 'M'), (day_idx, 'D')]
indexes.sort()
Expand All @@ -1167,7 +1176,6 @@ def parse_date(string, locale=LC_TIME):
# FIXME: this currently only supports numbers, but should also support month
# names, both in the requested locale, and english

numbers = re.findall(r'(\d+)', string)
year = numbers[indexes['Y']]
if len(year) == 2:
year = 2000 + int(year)
Expand All @@ -1180,7 +1188,7 @@ def parse_date(string, locale=LC_TIME):
return date(year, month, day)


def parse_time(string, locale=LC_TIME, format='medium'):
    """Parse a time from a string.

    The locale's time pattern is used as a hint for the order in which the
    hour, minute and second fields appear.  The numbers found in ``string``
    are then assigned to those fields by position; minutes and seconds
    default to zero when the input has no number for them.

    >>> parse_time('15:30:00', locale='en_US')
    datetime.time(15, 30)

    :param string: the string containing the time
    :param locale: a `Locale` object or a locale identifier
    :param format: the format to use (see ``get_time_format``)
    :return: the parsed time
    :rtype: `time`
    :raise ParseError: if ``string`` contains no digits, or holds no number
                       at the position of the hour field
    """
    numbers = re.findall(r'(\d+)', string)
    if not numbers:
        raise ParseError("No numbers were found in input")

    # TODO: try ISO format first?
    format_str = get_time_format(format=format, locale=locale).pattern.lower()

    # ``str.find`` returns -1 for a missing symbol, whereas ``str.index``
    # raises ValueError; only with ``find`` is the 'k' fallback reachable
    # and a pattern without seconds (e.g. a short format) usable.
    hour_idx = format_str.find('h')
    if hour_idx < 0:
        hour_idx = format_str.find('k')
    present = [
        (idx, field)
        for idx, field in (
            (hour_idx, 'H'),
            (format_str.find('m'), 'M'),
            (format_str.find('s'), 'S'),
        )
        if idx >= 0
    ]
    # Rank only the fields that occur in the pattern, so that an absent
    # field cannot shift the positions of the others.
    indexes = {field: rank for rank, (_, field) in enumerate(sorted(present))}

    def field_value(field):
        # Number at the field's position, or None if the pattern lacks the
        # field or the input has too few numbers.
        rank = indexes.get(field)
        if rank is None or rank >= len(numbers):
            return None
        return int(numbers[rank])

    hour = field_value('H')
    if hour is None:
        raise ParseError("No hour was found in input")
    minute = field_value('M') or 0
    second = field_value('S') or 0

    # TODO: support time zones

    # If the pattern carries a day-period field, honour an am/pm marker in
    # the input using 12-hour-clock rules: 1-11 pm map to 13-23, "12 pm"
    # stays 12 and "12 am" becomes 0.  Only the literal markers "am"/"pm"
    # (case-insensitive) are recognised.
    if 'a' in format_str:
        lowered = string.lower()
        if 'pm' in lowered:
            if hour < 12:
                hour += 12
        elif 'am' in lowered and hour == 12:
            hour = 0

    return time(hour, minute, second)


Expand Down
35 changes: 31 additions & 4 deletions tests/test_dates.py
Expand Up @@ -775,10 +775,37 @@ def test_format_timedelta():
def test_parse_date():
assert dates.parse_date('4/1/04', locale='en_US') == date(2004, 4, 1)
assert dates.parse_date('01.04.2004', locale='de_DE') == date(2004, 4, 1)


def test_parse_time():
assert dates.parse_time('15:30:00', locale='en_US') == time(15, 30)
assert dates.parse_date('2004-04-01', locale='sv_SE', format='short') == date(2004, 4, 1)


@pytest.mark.parametrize(('time_string', 'expected'), [
    # base case, fully qualified time
    ('15:30:00', time(15, 30)),
    # test digits
    ('15:30', time(15, 30)),
    ('3:30', time(3, 30)),
    ('00:30', time(0, 30)),
    # test am parsing
    ('03:30 am', time(3, 30)),
    ('3:30:21 am', time(3, 30, 21)),
    ('3:30 am', time(3, 30)),
    # test pm parsing
    ('03:30 pm', time(15, 30)),
    ('03:30 pM', time(15, 30)),
    ('03:30 Pm', time(15, 30)),
    ('03:30 PM', time(15, 30)),
    # test hour-only parsing
    ('4 pm', time(16, 0)),
])
def test_parse_time(time_string, expected):
    """Check ``parse_time`` against known inputs for the ``en_US`` locale."""
    # The argument is named ``time_string`` rather than ``input`` so the
    # builtin ``input`` is not shadowed inside the test.
    assert dates.parse_time(time_string, locale='en_US') == expected


@pytest.mark.parametrize('case', ['', 'a', 'aaa'])
@pytest.mark.parametrize('func', [dates.parse_date, dates.parse_time])
def test_parse_errors(case, func):
    """Inputs without any digits make both parsers raise ``ParseError``."""
    # Each digit-free input is tried with each parser in turn.
    with pytest.raises(dates.ParseError):
        func(case, locale='en_US')


def test_datetime_format_get_week_number():
Expand Down