Skip to content

Commit

Permalink
order_languages: use the latest order from W3Techs (#1081)
Browse files Browse the repository at this point in the history
  • Loading branch information
serhii73 committed Oct 13, 2022
1 parent 62a2a84 commit 484d4fb
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 70 deletions.
120 changes: 60 additions & 60 deletions dateparser/data/languages_info.py
Expand Up @@ -2,43 +2,42 @@
"en",
"ru",
"es",
"de",
"tr",
"fa",
"fr",
"de",
"fa",
"ja",
"pt",
"vi",
"zh",
"zh-Hans",
"ar",
"vi",
"it",
"nl",
"pt",
"ar",
"pl",
"id",
"el",
"nl",
"ko",
"uk",
"th",
"he",
"uk",
"cs",
"sv",
"ro",
"hu",
"el",
"da",
"hu",
"fi",
"sr",
"sr-Cyrl",
"sk",
"fi",
"bg",
"nb",
"hr",
"lt",
"hi",
"nb",
"sl",
"nn",
"ca",
"et",
"lv",
"bn",
"ur",
"sw",
Expand Down Expand Up @@ -83,7 +82,6 @@
"az",
"az-Latn",
"af",
"ca",
"sr-Latn",
"ii",
"bm",
Expand Down Expand Up @@ -136,8 +134,10 @@
"mk",
"uz-Arab",
"mas",
"nn",
"kde",
"mfe",
"lv",
"seh",
"mgh",
"az-Cyrl",
Expand Down Expand Up @@ -936,12 +936,17 @@
"es-UY",
"es-VE"
],
"de": [
"de-AT",
"de-BE",
"de-CH",
"de-IT",
"de-LI",
"de-LU"
],
"tr": [
"tr-CY"
],
"fa": [
"fa-AF"
],
"fr": [
"fr-BE",
"fr-BF",
Expand Down Expand Up @@ -989,15 +994,30 @@
"fr-WF",
"fr-YT"
],
"de": [
"de-AT",
"de-BE",
"de-CH",
"de-IT",
"de-LI",
"de-LU"
"fa": [
"fa-AF"
],
"ja": [],
"zh": [],
"zh-Hans": [
"zh-Hans-HK",
"zh-Hans-MO",
"zh-Hans-SG"
],
"vi": [],
"it": [
"it-CH",
"it-SM",
"it-VA"
],
"nl": [
"nl-AW",
"nl-BE",
"nl-BQ",
"nl-CW",
"nl-SR",
"nl-SX"
],
"pt": [
"pt-AO",
"pt-CH",
Expand All @@ -1011,13 +1031,6 @@
"pt-ST",
"pt-TL"
],
"vi": [],
"zh": [],
"zh-Hans": [
"zh-Hans-HK",
"zh-Hans-MO",
"zh-Hans-SG"
],
"ar": [
"ar-AE",
"ar-BH",
Expand Down Expand Up @@ -1047,30 +1060,14 @@
"ar-TN",
"ar-YE"
],
"it": [
"it-CH",
"it-SM",
"it-VA"
],
"pl": [],
"id": [],
"el": [
"el-CY"
],
"nl": [
"nl-AW",
"nl-BE",
"nl-BQ",
"nl-CW",
"nl-SR",
"nl-SX"
],
"ko": [
"ko-KP"
],
"uk": [],
"th": [],
"he": [],
"uk": [],
"cs": [],
"sv": [
"sv-AX",
Expand All @@ -1079,31 +1076,37 @@
"ro": [
"ro-MD"
],
"hu": [],
"el": [
"el-CY"
],
"da": [
"da-GL"
],
"hu": [],
"fi": [],
"sr": [],
"sr-Cyrl": [
"sr-Cyrl-BA",
"sr-Cyrl-ME",
"sr-Cyrl-XK"
],
"sk": [],
"fi": [],
"bg": [],
"nb": [
"nb-SJ"
],
"hr": [
"hr-BA"
],
"lt": [],
"hi": [],
"nb": [
"nb-SJ"
],
"sl": [],
"nn": [],
"ca": [
"ca-AD",
"ca-FR",
"ca-IT"
],
"et": [],
"lv": [],
"bn": [
"bn-IN"
],
Expand Down Expand Up @@ -1188,11 +1191,6 @@
"af": [
"af-NA"
],
"ca": [
"ca-AD",
"ca-FR",
"ca-IT"
],
"sr-Latn": [
"sr-Latn-BA",
"sr-Latn-ME",
Expand Down Expand Up @@ -1273,8 +1271,10 @@
"mas": [
"mas-TZ"
],
"nn": [],
"kde": [],
"mfe": [],
"lv": [],
"seh": [],
"mgh": [],
"az-Cyrl": [],
Expand Down
71 changes: 63 additions & 8 deletions dateparser_scripts/order_languages.py
Expand Up @@ -5,20 +5,14 @@
import regex as re

from dateparser_scripts.utils import get_raw_data
from parsel import Selector
import requests

os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Languages with insufficient translation data are excluded
avoid_languages = {'cu', 'kkj', 'nds', 'prg', 'tk', 'vai', 'vai-Latn', 'vai-Vaii', 'vo'}

# Order from https://w3techs.com/technologies/overview/content_language
# Last updated on 30.10.2020
most_common_locales = [
'en', 'ru', 'es', 'tr', 'fa', 'fr', 'de', 'ja', 'pt', 'vi', 'zh', 'ar', 'it', 'pl', 'id', 'el',
'nl', 'ko', 'th', 'he', 'uk', 'cs', 'sv', 'ro', 'hu', 'da', 'sr', 'sk', 'fi', 'bg', 'hr', 'lt',
'hi', 'nb', 'sl', 'nn', 'et', 'lv'
]


def _get_language_locale_dict():
cldr_dates_full_dir = "../raw_data/cldr_dates_full/main/"
Expand All @@ -40,6 +34,66 @@ def _get_language_locale_dict():


def _get_language_order(language_locale_dict):
def get_most_common_locales():
    """Return language codes ordered by web usage according to W3Techs.

    The order is scraped from the W3Techs "content language" overview page.
    This is a best-effort lookup: if the page cannot be fetched, answers with
    an error status, or no longer has the expected layout, the hard-coded
    snapshot ``old_common_locales`` is returned instead.

    Returns:
        list of str: language codes, most used first ('en' is always first).
    """
    # Order from https://w3techs.com/technologies/overview/content_language
    # Last updated on 03.10.2022
    old_common_locales = [
        'en', 'ru', 'es', 'de', 'tr', 'fr', 'fa', 'ja', 'zh', 'vi',
        'it', 'nl', 'pt', 'ar', 'pl', 'id', 'ko', 'uk', 'th', 'he',
        'cs', 'sv', 'ro', 'el', 'da', 'hu', 'fi', 'sr', 'sk', 'bg',
        'nb', 'hr', 'lt', 'no', 'hi', 'sl', 'ca', 'et',
    ]

    try:
        # A timeout keeps the script from hanging forever on a stalled
        # connection; any network failure falls back to the snapshot.
        response = requests.get(
            'https://w3techs.com/technologies/overview/content_language',
            timeout=30,
        )
    except requests.RequestException as e:
        print(e)
        print("Could not fetch the W3Techs page, using the hard-coded order")
        return old_common_locales

    if not response.ok:
        return old_common_locales

    try:
        sel = Selector(text=response.text)
        bars = sel.xpath("//table[@class='bars']//a/@href").getall()
        if not bars:
            raise ValueError("No bars found")
        # Each link ends in "cl-<code>-": drop the common prefix, then the
        # surrounding dashes, to keep only the language code.
        new_most_common_locales = [
            href.replace('https://w3techs.com/technologies/details/cl', '').strip('-')
            for href in bars
        ]
        # Sanity check: English has always topped the list, so anything else
        # most likely means the page layout changed.
        if new_most_common_locales[0] != 'en':
            raise ValueError("English is not the first language")
    except Exception as e:  # deliberate best effort: never fail on scraping
        print(e)
        print("The website could have changed, please update the code")
        return old_common_locales

    return new_most_common_locales

territory_info_file = "../raw_data/cldr_core/supplemental/territoryInfo.json"
with open(territory_info_file) as f:
territory_content = json.load(f)
Expand All @@ -59,6 +113,7 @@ def _get_language_order(language_locale_dict):
except Exception:
pass

most_common_locales = get_most_common_locales()
language_order_with_duplicates = (
most_common_locales
+ sorted(
Expand Down
2 changes: 2 additions & 0 deletions dateparser_scripts/requirements.txt
@@ -1,2 +1,4 @@
gitpython
parsel
requests
ruamel.yaml
4 changes: 2 additions & 2 deletions tests/test_loading.py
Expand Up @@ -142,9 +142,9 @@ def test_get_locale_map_with_given_order(self, given_locales):
param(given_locales=['en-FJ', 'pt-CV', 'fr-RW'],
expected_locales=['en-FJ', 'fr-RW', 'pt-CV']),
param(given_locales=['pt-AO', 'hi', 'zh-Hans-SG', 'vi'],
expected_locales=['pt-AO', 'vi', 'zh-Hans-SG', 'hi']),
expected_locales=['zh-Hans-SG', 'vi', 'pt-AO', 'hi']),
param(given_locales=['gsw-FR', 'es-BZ', 'ca-IT', 'qu-EC'],
expected_locales=['es-BZ', 'qu-EC', 'ca-IT', 'gsw-FR']),
expected_locales=['es-BZ', 'ca-IT', 'qu-EC', 'gsw-FR']),
])
def test_get_locale_map_without_given_order(self, given_locales, expected_locales):
self.given_locale_map(locales=given_locales, use_given_order=False)
Expand Down

0 comments on commit 484d4fb

Please sign in to comment.