Merge branch 'master' into parse_unix_timestamps

scrapinghub · Jun 21, 2022 · bc5ae4d · bc5ae4d
2 parents 3faf65f + 0ed979e
commit bc5ae4d
Show file tree

Hide file tree

Showing 44 changed files with 1,468 additions and 92 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -45,7 +45,7 @@ jobs:
         with:
           python-version: '${{ matrix.python-version }}'
       - name: Install language-pack-fr
-        run: sudo apt-get install language-pack-fr
+        run: sudo apt-get update && sudo apt-get install language-pack-fr
       - name: Install python dependencies
         run: pip install tox
       - name: tox

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,27 @@
+name: Upload Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.x"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel twine
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload dist/*
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 *.py[cod]
+.python-version
 
 # C extensions
 *.so
@@ -47,6 +48,7 @@ docs/_build
 # Editors
 *.swp
 .idea
+.vscode/
 
 # Other
 raw_data
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -89,7 +89,7 @@ tests, including testing other Python versions with tox::
 
     $ tox
 
-   To get flake8 and tox, just pip install them into your virtualenv. (Note that we use ``max-line-length = 100`` for flake8, this is configured in ``setup.cfg`` file.)
+   To get ``tox``, just ``pip install`` it into your virtualenv. In addition to tests, ``tox`` checks for code style and maximum line length (119 characters).
 
 6. Commit your changes and push your branch to GitHub::
 
@@ -169,7 +169,7 @@ language, you must:
    :hidden:
 
    template
-   
+
 Updating the List of Supported Languages and Locales
 ----------------------------------------------------
 

diff --git a/HISTORY.rst b/HISTORY.rst
@@ -3,6 +3,50 @@
 History
 =======
 
+1.1.1 (2022-03-17)
+------------------
+
+Improvements:
+
+- Fixed issue with regex library by pinning dependencies to an earlier version (< 2022.3.15, #1046).
+- Extended support for Russian language dates starting with lowercase (#999).
+- Allowed ``use_given_order`` to be used for languages too (#997).
+- Fixed link to settings section (#1018).
+- Defined UTF-8 encoding for Windows (#998).
+- Fixed directories creation error in CLI utils (#1022).
+
+
+1.1.0 (2021-10-04)
+------------------
+
+New features:
+
+* Support language detection based on ``langdetect``, ``fastText``, or a
+  custom implementation (see #932)
+* Add support for 'by <time>' (see #839)
+* Sort default language list by internet usage (see #805)
+
+Improvements:
+
+* Improved support of Chinese (#910), Czech (#977)
+* Improvements in ``search_dates`` (see #953)
+* Make order of previous locales deterministic (see #851)
+* Fix parsing with trailing space (see #841)
+* Consider ``RETURN_TIME_AS_PERIOD`` for timestamp times (see #922)
+* Exclude failing regex version (see #974)
+* Ongoing work on multithreading support (see #881, #885)
+* Add demo URL (see #883)
+
+QA:
+
+* Migrate pipelines from Travis CI to GitHub Actions (see #859, #879, #884,
+  #886, #911, #966)
+* Use versioned CLDR data (see #825)
+* Add a script to update table of supported languages and locales (see #601)
+* Sort 'skip' keys in yaml files (see #844)
+* Improve test coverage (see #827)
+* Code cleanup (see #888, #907, #951, #958, #957)
+
 
 1.0.0 (2020-10-29)
 ------------------

diff --git a/README.rst b/README.rst
@@ -133,7 +133,7 @@ You can control multiple behaviors by using the ``settings`` parameter:
     datetime.datetime(1992, 1, 2, 0, 0)
 
 To see more examples on how to use the ``settings``, check the `settings
-section <https://dateparser.readthedocs.io/en/latest/usage.html#settings>`__
+section <https://dateparser.readthedocs.io/en/latest/settings.html>`__
 in the docs.
 
 False positives

diff --git a/dateparser/__init__.py b/dateparser/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.0.0'
+__version__ = '1.1.1'
 
 from .date import DateDataParser
 from .conf import apply_settings
@@ -7,7 +7,8 @@
 
 
 @apply_settings
-def parse(date_string, date_formats=None, languages=None, locales=None, region=None, settings=None):
+def parse(date_string, date_formats=None, languages=None, locales=None,
+          region=None, settings=None, detect_languages_function=None):
     """Parse date and time from given date string.
 
     :param date_string:
@@ -39,6 +40,12 @@ def parse(date_string, date_formats=None, languages=None, locales=None, region=N
         Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
     :type settings: dict
 
+    :param detect_languages_function:
+        A function for language detection that takes as input a string (the `date_string`) and
+        a `confidence_threshold`, and returns a list of detected language codes.
+        Note: this function is only used if ``languages`` and ``locales`` are not provided.
+    :type detect_languages_function: function
+
     :return: Returns :class:`datetime <datetime.datetime>` representing parsed date if successful, else returns None
     :rtype: :class:`datetime <datetime.datetime>`.
     :raises:
@@ -47,9 +54,9 @@ def parse(date_string, date_formats=None, languages=None, locales=None, region=N
     """
     parser = _default_parser
 
-    if languages or locales or region or not settings._default:
+    if languages or locales or region or detect_languages_function or not settings._default:
         parser = DateDataParser(languages=languages, locales=locales,
-                                region=region, settings=settings)
+                                region=region, settings=settings, detect_languages_function=detect_languages_function)
 
     data = parser.get_date_data(date_string, date_formats)
 

diff --git a/dateparser/conf.py b/dateparser/conf.py
@@ -2,6 +2,7 @@
 from datetime import datetime
 from functools import wraps
 
+from dateparser.data.languages_info import language_order
 from .parser import date_order_chart
 from .utils import registry
 
@@ -25,6 +26,8 @@ class Settings:
     * `NORMALIZE`
     * `RETURN_TIME_AS_PERIOD`
     * `PARSERS`
+    * `DEFAULT_LANGUAGES`
+    * `LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD`
     """
 
     _default = True
@@ -117,7 +120,7 @@ def _check_require_part(setting_name, setting_value):
 def _check_parsers(setting_name, setting_value):
     """Returns `True` if the provided list of parsers contains valid values"""
     existing_parsers = [
-        'timestamp', 'relative-time', 'custom-formats', 'absolute-time', 'no-spaces-time'
+        'timestamp', 'relative-time', 'custom-formats', 'absolute-time', 'no-spaces-time', 'negative-timestamp'
     ]  # FIXME: Extract the list of existing parsers from another place (#798)
     unknown_parsers = set(setting_value) - set(existing_parsers)
     if unknown_parsers:
@@ -129,6 +132,28 @@ def _check_parsers(setting_name, setting_value):
     _check_repeated_values(setting_name, setting_value)
 
 
+def _check_default_languages(setting_name, setting_value):
+    unsupported_languages = set(setting_value) - set(language_order)
+    if unsupported_languages:
+        raise SettingValidationError(
+            "Found invalid languages in the '{}' setting: {}".format(
+                setting_name, ', '.join(map(repr, unsupported_languages))
+            )
+        )
+    _check_repeated_values(setting_name, setting_value)
+
+
+def _check_between_0_and_1(setting_name, setting_value):
+    is_valid = 0 <= setting_value <= 1
+    if not is_valid:
+        raise SettingValidationError(
+            '{} is not a valid value for {}. It can take values between 0 and '
+            '1.'.format(
+                setting_value, setting_name,
+            )
+        )
+
+
 def check_settings(settings):
     """
     Check if provided settings are valid, if not it raises `SettingValidationError`.
@@ -193,6 +218,14 @@ def check_settings(settings):
         'PREFER_LOCALE_DATE_ORDER': {
             'type': bool
         },
+        'DEFAULT_LANGUAGES': {
+            'type': list,
+            'extra_check': _check_default_languages
+        },
+        'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': {
+            'type': float,
+            'extra_check': _check_between_0_and_1
+        },
     }
 
     modified_settings = settings._mod_settings  # check only modified settings

diff --git a/dateparser/custom_language_detection/__init__.py b/dateparser/custom_language_detection/__init__.py
diff --git a/dateparser/custom_language_detection/fasttext.py b/dateparser/custom_language_detection/fasttext.py
@@ -0,0 +1,45 @@
+import os
+
+import fasttext
+
+from dateparser_cli.fasttext_manager import fasttext_downloader
+from dateparser_cli.utils import dateparser_model_home, create_data_model_home
+from dateparser_cli.exceptions import FastTextModelNotFoundException
+
+
+_supported_models = ["large.bin", "small.bin"]
+_DEFAULT_MODEL = "small"
+
+
+class _FastTextCache:
+    model = None
+
+
+def _load_fasttext_model():
+    if _FastTextCache.model:
+        return _FastTextCache.model
+    create_data_model_home()
+    downloaded_models = [
+        file for file in os.listdir(dateparser_model_home)
+        if file in _supported_models
+    ]
+    if not downloaded_models:
+        fasttext_downloader(_DEFAULT_MODEL)
+        return _load_fasttext_model()
+    model_path = os.path.join(dateparser_model_home, downloaded_models[0])
+    if not os.path.isfile(model_path):
+        raise FastTextModelNotFoundException('Fasttext model file not found')
+    _FastTextCache.model = fasttext.load_model(model_path)
+    return _FastTextCache.model
+
+
+def detect_languages(text, confidence_threshold):
+    _language_parser = _load_fasttext_model()
+    text = text.replace('\n', ' ').replace('\r', '')
+    language_codes = []
+    parser_data = _language_parser.predict(text)
+    for idx, language_probability in enumerate(parser_data[1]):
+        if language_probability > confidence_threshold:
+            language_code = parser_data[0][idx].replace("__label__", "")
+            language_codes.append(language_code)
+    return language_codes
diff --git a/dateparser/custom_language_detection/langdetect.py b/dateparser/custom_language_detection/langdetect.py
@@ -0,0 +1,37 @@
+import langdetect
+
+
+# The _Factory below holds our own DetectorFactory instance (with a fixed seed)
+# so that we avoid setting the library's global state but still get consistent results.
+# See: https://github.com/Mimino666/langdetect
+
+class _Factory:
+    data = None
+
+
+def _init_factory():
+    if _Factory.data is None:
+        _Factory.data = langdetect.detector_factory.DetectorFactory()
+        _Factory.data.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY)
+        _Factory.data.seed = 0
+
+
+def _get_language_probablities(text):
+    _init_factory()
+    detector = _Factory.data.create()
+    detector.append(text)
+    return detector.get_probabilities()
+
+
+def detect_languages(text, confidence_threshold):
+    language_codes = []
+    try:
+        parser_data = _get_language_probablities(text)
+        for language_candidate in parser_data:
+            if language_candidate.prob > confidence_threshold:
+                language_codes.append(language_candidate.lang)
+    except langdetect.lang_detect_exception.LangDetectException:
+        # This exception can be produced with empty strings or inputs without letters like `10-10-2021`.
+        # As this could be really common, we ignore them.
+        pass
+    return language_codes
diff --git a/dateparser/custom_language_detection/language_mapping.py b/dateparser/custom_language_detection/language_mapping.py
@@ -0,0 +1,18 @@
+from dateparser.data.languages_info import language_map
+
+
+def map_languages(language_codes):
+    """
+    Returns the candidates from the supported languages codes.
+    :param language_codes:
+        A list of language codes, e.g. ['en', 'es'] in ISO 639 Standard.
+    :type language_codes: list
+    :return: Returns list[str] representing supported languages
+    :rtype: list[str]
+    """
+    return [
+        language_code
+        for language in language_codes
+        if language in language_map
+        for language_code in language_map[language]
+    ]