From 60d80038b8946fde3e08af8ea91e821202a5dbe8 Mon Sep 17 00:00:00 2001 From: Steven Willis Date: Tue, 27 Jul 2021 18:58:11 -0400 Subject: [PATCH 1/3] test coverage for how unix timestamps are parsed --- tests/test_date.py | 66 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/test_date.py b/tests/test_date.py index 70dff744a..c1d87fa64 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -1,9 +1,12 @@ #!/usr/bin/env python +import os import unittest from collections import OrderedDict from copy import copy -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone as dttz +from itertools import product +from time import tzset from unittest.mock import Mock, patch from parameterized import parameterized, param @@ -710,6 +713,38 @@ def then_date_object_is_invalid(self): class TestTimestampParser(BaseTestCase): + def given_parser(self, **params): + self.parser = date.DateDataParser(**params) + + def given_tzstr(self, tzstr): + # Save the existing value + self.old_tzstr = os.environ['TZ'] if 'TZ' in os.environ else None + + # Overwrite the value, or remove it + if tzstr is not None: + os.environ['TZ'] = tzstr + elif 'TZ' in os.environ: + del os.environ['TZ'] + + # Call tzset + tzset() + + def reset_tzstr(self): + # If we never set it with given_tzstr, don't bother resetting it + if not hasattr(self, 'old_tzstr'): + return + + # Restore the old value, or remove it if null + if self.old_tzstr is not None: + os.environ['TZ'] = self.old_tzstr + elif 'TZ' in os.environ: + del os.environ['TZ'] + + # Remove the local attribute + del self.old_tzstr + + # Restore the old timezone behavior + tzset() def test_timestamp_in_milliseconds(self): self.assertEqual( @@ -717,6 +752,35 @@ def test_timestamp_in_milliseconds(self): datetime.fromtimestamp(1570308760).replace(microsecond=263000) ) + @parameterized.expand( + product( + ['1570308760'], + ['EDT', 'EST', 'PDT', 'PST', 'UTC', 'local'], + [None, 'EDT', 'EST', 'PDT', 'PST', 'UTC'], + ['EST5EDT4', 'UTC0', 'PST8PDT7', None], + ) + ) + def test_timestamp_with_different_timestr(self, timestamp, timezone, to_timezone, tzstr): + settings = { + 'RETURN_AS_TIMEZONE_AWARE': True, + 'TIMEZONE': timezone, + } + + # is TO_TIMEZONE supposed to be allowed to be False, or None ??? + if to_timezone is not None: + settings['TO_TIMEZONE'] = to_timezone + + self.given_parser(settings=settings) + + self.given_tzstr(tzstr) + + self.assertEqual( + self.parser.get_date_data(timestamp)['date_obj'], + datetime.fromtimestamp(int(timestamp), dttz.utc) + ) + + self.reset_tzstr() + def test_timestamp_in_microseconds(self): self.assertEqual( date.get_date_from_timestamp('1570308760263111', None), From 3faf65f9a14640439782ae7a918c26790f8c3fb8 Mon Sep 17 00:00:00 2001 From: Steven Willis Date: Tue, 27 Jul 2021 14:40:35 -0400 Subject: [PATCH 2/3] Consider the local tz when creating a datetime object from timestamp Otherwise datetime.fromtimestamp will return a naive datetime in the local timezone and applying timezone modifications later with TIMEZONE and TO_TIMEZONE won't do the right thing. Parsing a unix timestamp value should always result in the exact same instant in time regardless of current time zone, so it shouldn't matter what the current TIMEZONE setting is, whether 'UTC', 'local', any other timezone, or even unset. --- dateparser/date.py | 15 ++++++++++++--- dateparser/utils/__init__.py | 19 ++++++++++--------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/dateparser/date.py b/dateparser/date.py index 01e269554..8cc3d2d57 100644 --- a/dateparser/date.py +++ b/dateparser/date.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta import regex as re +from tzlocal import get_localzone from dateutil.relativedelta import relativedelta from dateparser.date_parser import date_parser @@ -13,7 +14,8 @@ from dateparser.parser import _parse_absolute, _parse_nospaces from dateparser.timezone_parser import pop_tz_offset_from_string from dateparser.utils import apply_timezone_from_settings, \ - set_correct_day_from_settings + set_correct_day_from_settings, \ + get_timezone_from_tz_string APOSTROPHE_LOOK_ALIKE_CHARS = [ '\N{RIGHT SINGLE QUOTATION MARK}', # '\u2019' @@ -114,11 +116,18 @@ def sanitize_date(date_string): def get_date_from_timestamp(date_string, settings): match = RE_SEARCH_TIMESTAMP.search(date_string) if match: + if settings is not None and settings.TIMEZONE is not None and 'local' not in settings.TIMEZONE.lower(): + local_timezone = get_timezone_from_tz_string(settings.TIMEZONE) + else: + local_timezone = get_localzone() + seconds = int(match.group(1)) millis = int(match.group(2) or 0) micros = int(match.group(3) or 0) - date_obj = datetime.fromtimestamp(seconds) - date_obj = date_obj.replace(microsecond=millis * 1000 + micros) + date_obj = (datetime + .fromtimestamp(seconds, local_timezone) + .replace(microsecond=millis * 1000 + micros, tzinfo=None) + ) date_obj = apply_timezone_from_settings(date_obj, settings) return date_obj diff --git a/dateparser/utils/__init__.py b/dateparser/utils/__init__.py index 70b2fae48..b3ae557e7 100644 --- a/dateparser/utils/__init__.py +++ b/dateparser/utils/__init__.py @@ -65,22 +65,23 @@ def _get_missing_parts(fmt): return missing -def localize_timezone(date_time, tz_string): - if date_time.tzinfo: - return date_time - - tz = None - +def get_timezone_from_tz_string(tz_string): try: - tz = timezone(tz_string) + return timezone(tz_string) except UnknownTimeZoneError as e: for name, info in _tz_offsets: if info['regex'].search(' %s' % tz_string): - tz = StaticTzInfo(name, info['offset']) - break + return StaticTzInfo(name, info['offset']) else: raise e + +def localize_timezone(date_time, tz_string): + if date_time.tzinfo: + return date_time + + tz = get_timezone_from_tz_string(tz_string) + if hasattr(tz, 'localize'): date_time = tz.localize(date_time) else: From 693464880b197694c7bc2734422e843cfbe698c4 Mon Sep 17 00:00:00 2001 From: Steven Willis Date: Fri, 2 Sep 2022 22:19:40 -0400 Subject: [PATCH 3/3] Clarify logic for timezones when parsing unix timestamps --- dateparser/date.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dateparser/date.py b/dateparser/date.py index c24ec5c77..6124e53ef 100644 --- a/dateparser/date.py +++ b/dateparser/date.py @@ -119,19 +119,24 @@ def get_date_from_timestamp(date_string, settings, negative=False): if negative: match = RE_SEARCH_NEGATIVE_TIMESTAMP.search(date_string) else: - match = RE_SEARCH_TIMESTAMP.search(date_string) + match = RE_SEARCH_TIMESTAMP.search(date_string) if match: - if settings is not None and settings.TIMEZONE is not None and 'local' not in settings.TIMEZONE.lower(): - local_timezone = get_timezone_from_tz_string(settings.TIMEZONE) + if (settings is None or + settings.TIMEZONE is None or + 'local' in settings.TIMEZONE.lower()): + # If the timezone in settings is unset, or it's 'local', use the + # local timezone + timezone = get_localzone() else: - local_timezone = get_localzone() + # Otherwise, use the timezone given in settings + timezone = get_timezone_from_tz_string(settings.TIMEZONE) seconds = int(match.group(1)) millis = int(match.group(2) or 0) micros = int(match.group(3) or 0) date_obj = (datetime - .fromtimestamp(seconds, local_timezone) + .fromtimestamp(seconds, timezone) .replace(microsecond=millis * 1000 + micros, tzinfo=None) ) date_obj = apply_timezone_from_settings(date_obj, settings)