Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a script that changes the order of languages #1081

Merged
merged 4 commits into from Oct 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
120 changes: 60 additions & 60 deletions dateparser/data/languages_info.py
Expand Up @@ -2,43 +2,42 @@
"en",
"ru",
"es",
"de",
"tr",
"fa",
"fr",
"de",
"fa",
"ja",
"pt",
"vi",
"zh",
"zh-Hans",
"ar",
"vi",
"it",
"nl",
"pt",
"ar",
"pl",
"id",
"el",
"nl",
"ko",
"uk",
"th",
"he",
"uk",
"cs",
"sv",
"ro",
"hu",
"el",
"da",
"hu",
"fi",
"sr",
"sr-Cyrl",
"sk",
"fi",
"bg",
"nb",
"hr",
"lt",
"hi",
"nb",
"sl",
"nn",
"ca",
"et",
"lv",
"bn",
"ur",
"sw",
Expand Down Expand Up @@ -83,7 +82,6 @@
"az",
"az-Latn",
"af",
"ca",
"sr-Latn",
"ii",
"bm",
Expand Down Expand Up @@ -136,8 +134,10 @@
"mk",
"uz-Arab",
"mas",
"nn",
"kde",
"mfe",
"lv",
"seh",
"mgh",
"az-Cyrl",
Expand Down Expand Up @@ -936,12 +936,17 @@
"es-UY",
"es-VE"
],
"de": [
"de-AT",
"de-BE",
"de-CH",
"de-IT",
"de-LI",
"de-LU"
],
"tr": [
"tr-CY"
],
"fa": [
"fa-AF"
],
"fr": [
"fr-BE",
"fr-BF",
Expand Down Expand Up @@ -989,15 +994,30 @@
"fr-WF",
"fr-YT"
],
"de": [
"de-AT",
"de-BE",
"de-CH",
"de-IT",
"de-LI",
"de-LU"
"fa": [
"fa-AF"
],
"ja": [],
"zh": [],
"zh-Hans": [
"zh-Hans-HK",
"zh-Hans-MO",
"zh-Hans-SG"
],
"vi": [],
"it": [
"it-CH",
"it-SM",
"it-VA"
],
"nl": [
"nl-AW",
"nl-BE",
"nl-BQ",
"nl-CW",
"nl-SR",
"nl-SX"
],
"pt": [
"pt-AO",
"pt-CH",
Expand All @@ -1011,13 +1031,6 @@
"pt-ST",
"pt-TL"
],
"vi": [],
"zh": [],
"zh-Hans": [
"zh-Hans-HK",
"zh-Hans-MO",
"zh-Hans-SG"
],
"ar": [
"ar-AE",
"ar-BH",
Expand Down Expand Up @@ -1047,30 +1060,14 @@
"ar-TN",
"ar-YE"
],
"it": [
"it-CH",
"it-SM",
"it-VA"
],
"pl": [],
"id": [],
"el": [
"el-CY"
],
"nl": [
"nl-AW",
"nl-BE",
"nl-BQ",
"nl-CW",
"nl-SR",
"nl-SX"
],
"ko": [
"ko-KP"
],
"uk": [],
"th": [],
"he": [],
"uk": [],
"cs": [],
"sv": [
"sv-AX",
Expand All @@ -1079,31 +1076,37 @@
"ro": [
"ro-MD"
],
"hu": [],
"el": [
"el-CY"
],
"da": [
"da-GL"
],
"hu": [],
"fi": [],
"sr": [],
"sr-Cyrl": [
"sr-Cyrl-BA",
"sr-Cyrl-ME",
"sr-Cyrl-XK"
],
"sk": [],
"fi": [],
"bg": [],
"nb": [
"nb-SJ"
],
"hr": [
"hr-BA"
],
"lt": [],
"hi": [],
"nb": [
"nb-SJ"
],
"sl": [],
"nn": [],
"ca": [
"ca-AD",
"ca-FR",
"ca-IT"
],
"et": [],
"lv": [],
"bn": [
"bn-IN"
],
Expand Down Expand Up @@ -1188,11 +1191,6 @@
"af": [
"af-NA"
],
"ca": [
"ca-AD",
"ca-FR",
"ca-IT"
],
"sr-Latn": [
"sr-Latn-BA",
"sr-Latn-ME",
Expand Down Expand Up @@ -1273,8 +1271,10 @@
"mas": [
"mas-TZ"
],
"nn": [],
"kde": [],
"mfe": [],
"lv": [],
"seh": [],
"mgh": [],
"az-Cyrl": [],
Expand Down
71 changes: 63 additions & 8 deletions dateparser_scripts/order_languages.py
Expand Up @@ -5,20 +5,14 @@
import regex as re

from dateparser_scripts.utils import get_raw_data
from parsel import Selector
import requests

# Work from the directory containing this script so that the relative
# "../raw_data/..." paths opened further down resolve the same way no matter
# where the script is launched from.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Languages with insufficient translation data are excluded
avoid_languages = {'cu', 'kkj', 'nds', 'prg', 'tk', 'vai', 'vai-Latn', 'vai-Vaii', 'vo'}

# Order from https://w3techs.com/technologies/overview/content_language
# Last updated on 30.10.2020
# NOTE(review): _get_language_order also binds a local `most_common_locales`
# from get_most_common_locales(); confirm whether this module-level snapshot
# is still read anywhere before relying on it.
most_common_locales = [
'en', 'ru', 'es', 'tr', 'fa', 'fr', 'de', 'ja', 'pt', 'vi', 'zh', 'ar', 'it', 'pl', 'id', 'el',
'nl', 'ko', 'th', 'he', 'uk', 'cs', 'sv', 'ro', 'hu', 'da', 'sr', 'sk', 'fi', 'bg', 'hr', 'lt',
'hi', 'nb', 'sl', 'nn', 'et', 'lv'
]


def _get_language_locale_dict():
cldr_dates_full_dir = "../raw_data/cldr_dates_full/main/"
Expand All @@ -40,6 +34,66 @@ def _get_language_locale_dict():


def _get_language_order(language_locale_dict):
def get_most_common_locales():
    """Return language codes ordered from most to least common on the web.

    The order is scraped from the w3techs content-language overview page:
    every link inside the ``bars`` table carries the language code in its
    href. A hard-coded snapshot of that order is returned instead when the
    page cannot be fetched, the response status is not OK, no links are
    found, or the scraped list does not start with English.

    Returns:
        list: language codes as strings, most common first.
    """
    # Order from https://w3techs.com/technologies/overview/content_language
    # Last updated on 03.10.2022
    old_common_locales = [
        'en', 'ru', 'es', 'de', 'tr', 'fr', 'fa', 'ja', 'zh', 'vi',
        'it', 'nl', 'pt', 'ar', 'pl', 'id', 'ko', 'uk', 'th', 'he',
        'cs', 'sv', 'ro', 'el', 'da', 'hu', 'fi', 'sr', 'sk', 'bg',
        'nb', 'hr', 'lt', 'no', 'hi', 'sl', 'ca', 'et',
    ]

    try:
        # Without a timeout a stalled connection would block the script
        # forever; network failures must fall back to the snapshot rather
        # than abort the whole ordering run.
        response = requests.get(
            'https://w3techs.com/technologies/overview/content_language',
            timeout=30,
        )
    except requests.RequestException as e:
        print(e)
        print("Could not fetch the language statistics, using the stored order")
        return old_common_locales

    if not response.ok:
        return old_common_locales

    try:
        # Parsing lives inside the try block so that malformed markup also
        # triggers the fallback instead of an unhandled error.
        sel = Selector(text=response.text)
        bars = sel.xpath("//table[@class='bars']//a/@href").getall()
        if not bars:
            raise ValueError("No bars found")
        # hrefs look like ".../details/cl-<code>-"; strip the common prefix
        # and the surrounding dashes to keep only the language code.
        new_most_common_locales = [
            href.replace('https://w3techs.com/technologies/details/cl', '').strip('-')
            for href in bars
        ]
        # Sanity check: a page layout change would most likely break this.
        if new_most_common_locales[0] != 'en':
            raise ValueError("English is not the first language")
    except Exception as e:
        print(e)
        print("The website could have changed, please update the code")
        return old_common_locales
    return new_most_common_locales

territory_info_file = "../raw_data/cldr_core/supplemental/territoryInfo.json"
with open(territory_info_file) as f:
territory_content = json.load(f)
Expand All @@ -59,6 +113,7 @@ def _get_language_order(language_locale_dict):
except Exception:
pass

most_common_locales = get_most_common_locales()
language_order_with_duplicates = (
most_common_locales
+ sorted(
Expand Down
2 changes: 2 additions & 0 deletions dateparser_scripts/requirements.txt
@@ -1,2 +1,4 @@
gitpython
parsel
requests
ruamel.yaml
4 changes: 2 additions & 2 deletions tests/test_loading.py
Expand Up @@ -142,9 +142,9 @@ def test_get_locale_map_with_given_order(self, given_locales):
param(given_locales=['en-FJ', 'pt-CV', 'fr-RW'],
expected_locales=['en-FJ', 'fr-RW', 'pt-CV']),
param(given_locales=['pt-AO', 'hi', 'zh-Hans-SG', 'vi'],
expected_locales=['pt-AO', 'vi', 'zh-Hans-SG', 'hi']),
expected_locales=['zh-Hans-SG', 'vi', 'pt-AO', 'hi']),
param(given_locales=['gsw-FR', 'es-BZ', 'ca-IT', 'qu-EC'],
expected_locales=['es-BZ', 'qu-EC', 'ca-IT', 'gsw-FR']),
expected_locales=['es-BZ', 'ca-IT', 'qu-EC', 'gsw-FR']),
])
def test_get_locale_map_without_given_order(self, given_locales, expected_locales):
self.given_locale_map(locales=given_locales, use_given_order=False)
Expand Down