Faker optimizations as described in README.rst (joke2k#1348)

* Add micro-benchmark to make it easy to detect regressions. * Faker optimizations as described in README.rst . * Remove cruft * Flake8, bugfix * Remove use_internal_caches Use tuples instead of lists in several places Move fakebench so it doesn't run during tests Unit tests * Include benchmark * Improve arg naming * Tighten up type * Remove benchmark * Revert "Improve arg naming" This reverts commit d867c31. * Remove invalid Dutch surname (joke2k#1347) (joke2k#1349) * Update CHANGELOG.md * Bump version: 5.0.1 → 5.0.2 * fix: typo faker_locale -> faker_seed in pytest-fixtures (joke2k#1351) * en_IN person provider (joke2k#1355) * en_IN person provider * en_IN person provider linted * en_IN person providers removed formats * Update CHANGELOG.md * Bump version: 5.0.2 → 5.1.0 * fix typo in changelog * en_IN address + phone number provider (joke2k#1357) * en_IN person provider * en_IN person provider linted * en_IN person providers removed formats * en_IN address+phone_number added * test_address linted * Update CHANGELOG.md * Bump version: 5.1.0 → 5.2.0 * FW and JSON Improvements (joke2k#1350) * FW and JSON Improvements * Update assertion to be more clear * Update CHANGELOG.md * Bump version: 5.2.0 → 5.3.0 * da_DK address provider (joke2k#1353) * Add dk_DK address provider * Add tests for adress provider dk_DK * Modified tests and changed source for postcodes * Fix capitalization error * Fix locale code from dk_DK to da_DK * Update CHANGELOG.md * Bump version: 5.3.0 → 5.4.0 * Remove empty string from `ar_AA` Person Provider (joke2k#1364) * Update CHANGELOG.md * Bump version: 5.4.0 → 5.4.1 Co-authored-by: Theo Sinnige <72202732+TheoSinnige@users.noreply.github.com> Co-authored-by: fcurella <flavio.curella@gmail.com> Co-authored-by: legiahoang <giahoangth@gmail.com> Co-authored-by: Pulkit Gupta <42977200+pulkitgupta2k@users.noreply.github.com> Co-authored-by: johnbrandborg <john.brandborg@protonmail.com> Co-authored-by: JoseNavy 
<72418396+JoseNavy@users.noreply.github.com> Co-authored-by: Flavio Curella <89607+fcurella@users.noreply.github.com>
n1ngu · Jun 2, 2021 · 8415089 · 8415089
1 parent 5a00b96
commit 8415089
Show file tree

Hide file tree

Showing 7 changed files with 110 additions and 17 deletions.
diff --git a/README.rst b/README.rst
@@ -165,6 +165,15 @@ providers package. The localization of Faker is an ongoing process, for
 which we need your help. Please don't hesitate to create a localized
 provider for your own locale and submit a Pull Request (PR).
 
+Optimizations
+-------------
+The Faker constructor takes a performance-related argument called
+``use_weighting``. It specifies whether Faker should try to make the
+frequency of generated values match real-world frequencies (e.g. the English
+name Gary would be much more frequent than the name Lorimer). If
+``use_weighting`` is ``False``, every item has an equal chance of being
+selected, and the selection process is much faster. The default is ``True``.
+
 Command line usage
 ------------------
 

diff --git a/faker/factory.py b/faker/factory.py
@@ -27,6 +27,9 @@ def create(
             providers=None,
             generator=None,
             includes=None,
+            # Should we use weightings (more realistic) or weight every element equally (faster)?
+            # By default, use weightings for backwards compatibility & realism
+            use_weighting=True,
             **config):
         if includes is None:
             includes = []
@@ -39,6 +42,7 @@ def create(
             raise AttributeError(msg)
 
         config['locale'] = locale
+        config['use_weighting'] = use_weighting
         providers = providers or PROVIDERS
 
         providers += includes
@@ -51,6 +55,7 @@ def create(
 
             prov_cls, lang_found = cls._get_provider_class(prov_name, locale)
             provider = prov_cls(faker)
+            provider.__use_weighting__ = use_weighting
             provider.__provider__ = prov_name
             provider.__lang__ = lang_found
             faker.add_provider(provider)

diff --git a/faker/providers/__init__.py b/faker/providers/__init__.py
@@ -17,6 +17,7 @@ class BaseProvider:
 
     __provider__ = 'base'
     __lang__ = None
+    __use_weighting__ = False
 
     # Locales supported by Linux Mint from `/usr/share/i18n/SUPPORTED`
     language_locale_codes = {
@@ -215,7 +216,8 @@ def random_uppercase_letter(self):
         """
         return self.generator.random.choice(string.ascii_uppercase)
 
-    def random_elements(self, elements=('a', 'b', 'c'), length=None, unique=False):
+    def random_elements(self, elements=('a', 'b', 'c'), length=None, unique=False,
+                        use_weighting=None):
         """Generate a list of randomly sampled objects from ``elements``.
 
         Set ``unique`` to ``False`` for random sampling with replacement, and set ``unique`` to
@@ -271,6 +273,10 @@ def random_elements(self, elements=('a', 'b', 'c'), length=None, unique=False):
                        ("d", 0.05),
                    ]), unique=True
         """
+        use_weighting = (use_weighting
+                         if use_weighting is not None
+                         else self.__use_weighting__)
+
         if isinstance(elements, dict) and not isinstance(elements, OrderedDict):
             raise ValueError("Use OrderedDict only to avoid dependency on PYTHONHASHSEED (See #363).")
 
@@ -284,18 +290,21 @@ def random_elements(self, elements=('a', 'b', 'c'), length=None, unique=False):
                 "Sample length cannot be longer than the number of unique elements to pick from.")
 
         if isinstance(elements, dict):
-            choices = elements.keys()
-            probabilities = elements.values()
+            if not hasattr(elements, "_key_cache"):
+                elements._key_cache = tuple(elements.keys())
+
+            choices = elements._key_cache
+            probabilities = tuple(elements.values()) if use_weighting else None
         else:
             if unique:
                 # shortcut
                 return self.generator.random.sample(elements, length)
             choices = elements
-            probabilities = [1.0 for _ in range(len(choices))]
+            probabilities = None
 
         return fn(
-            list(choices),
-            list(probabilities),
+            tuple(choices),
+            probabilities,
             self.generator.random,
             length=length,
         )

diff --git a/faker/proxy.py b/faker/proxy.py
@@ -24,7 +24,8 @@ class Faker:
     ]
 
     def __init__(self, locale=None, providers=None,
-                 generator=None, includes=None, **config):
+                 generator=None, includes=None,
+                 use_weighting=True, **config):
         self._factory_map = OrderedDict()
         self._weights = None
         self._unique_proxy = UniqueProxy(self)
@@ -56,7 +57,9 @@ def __init__(self, locale=None, providers=None,
             locales = [DEFAULT_LOCALE]
 
         for locale in locales:
-            self._factory_map[locale] = Factory.create(locale, providers, generator, includes, **config)
+            self._factory_map[locale] = Factory.create(locale, providers, generator, includes,
+                                                       use_weighting=use_weighting,
+                                                       **config)
 
         self._locales = locales
         self._factories = list(self._factory_map.values())

diff --git a/faker/utils/distribution.py b/faker/utils/distribution.py
@@ -1,7 +1,8 @@
 import bisect
+import itertools
 
 from random import Random
-from typing import Generator, Iterable, List, Optional, TypeVar
+from typing import Generator, Iterable, Optional, Sequence, TypeVar
 
 from faker.generator import random as mod_random
 
@@ -23,8 +24,8 @@ def cumsum(it: Iterable[float]) -> Generator[float, None, None]:
 
 
 def choices_distribution_unique(
-        a: List[T], p: List[float], random: Optional[Random] = None, length: int = 1,
-) -> List[T]:
+        a: Sequence[T], p: Sequence[float], random: Optional[Random] = None, length: int = 1,
+) -> Sequence[T]:
     # As of Python 3.7, there isn't a way to sample unique elements that takes
     # weight into account.
     if random is None:
@@ -37,7 +38,7 @@ def choices_distribution_unique(
     items = list(a)
     probabilities = list(p)
     for i in range(length):
-        cdf = list(cumsum(probabilities))
+        cdf = tuple(cumsum(probabilities))
         normal = cdf[-1]
         cdf2 = [float(i) / float(normal) for i in cdf]
         uniform_sample = random_sample(random=random)
@@ -49,18 +50,26 @@ def choices_distribution_unique(
     return choices
 
 
-def choices_distribution(a: List[T], p: List[float], random: Optional[Random] = None, length: int = 1) -> List[T]:
+def choices_distribution(
+    a: Sequence[T], p: Sequence[float], random: Optional[Random] = None, length: int = 1,
+) -> Sequence[T]:
     if random is None:
         random = mod_random
 
-    assert len(a) == len(p)
+    if p is not None:
+        assert len(a) == len(p)
 
     if hasattr(random, 'choices'):
-        choices = random.choices(a, weights=p, k=length)
-        return choices
+        if length == 1 and p is None:
+            return (random.choice(a),)
+        else:
+            return random.choices(a, weights=p, k=length)
     else:
         choices = []
 
+        if p is None:
+            p = itertools.repeat(1, len(a))
+
         cdf = list(cumsum(p))
         normal = cdf[-1]
         cdf2 = [float(i) / float(normal) for i in cdf]

diff --git a/tests/providers/test_internet.py b/tests/providers/test_internet.py
@@ -236,7 +236,7 @@ def test_ipv4_distribution_selection(self):
         list_of_invalid_weights = [
             [1, 2, 3],   # List size does not match subnet list size
             ['a', 'b'],  # List size matches, but elements are invalid
-            None,        # Not a list or valid iterable
+            11,        # Not a list or valid iterable
         ]
 
         with patch('faker.providers.internet.choices_distribution',

diff --git a/tests/test_proxy.py b/tests/test_proxy.py
@@ -1,3 +1,5 @@
+import random
+
 from collections import OrderedDict
 from unittest.mock import PropertyMock, patch
 
@@ -318,6 +320,62 @@ def test_multiple_locale_factory_selection_unsupported_method(self):
         with pytest.raises(AttributeError):
             fake.obviously_invalid_provider_method_a23f()
 
+    @patch('random.Random.choice')
+    @patch('random.Random.choices')
+    def test_weighting_disabled_single_choice(self, mock_choices_fn, mock_choice_fn):
+        fake = Faker(use_weighting=False)
+        fake.first_name()
+        mock_choice_fn.assert_called()
+        mock_choices_fn.assert_not_called()
+
+    @patch('random.Random.choice')
+    @patch('random.Random.choices', wraps=random.Random().choices)
+    def test_weighting_disabled_with_locales(self, mock_choices_fn, mock_choice_fn):
+        locale = OrderedDict([
+            ('de_DE', 3),
+            ('en-US', 2),
+            ('en-PH', 1),
+            ('ja_JP', 5),
+        ])
+        fake = Faker(locale, use_weighting=False)
+        fake.first_name()
+        mock_choices_fn.assert_called()  # select provider
+        mock_choice_fn.assert_called()   # select within provider
+
+    @patch('random.Random.choice')
+    @patch('random.Random.choices', wraps=random.Random().choices)
+    def test_weighting_disabled_multiple_locales(self, mock_choices_fn, mock_choice_fn):
+        locale = OrderedDict([
+            ('de_DE', 3),
+            ('en-US', 2),
+            ('en-PH', 1),
+            ('ja_JP', 5),
+        ])
+        fake = Faker(locale, use_weighting=False)
+        fake.first_name()
+        mock_choices_fn.assert_called()  # select provider
+        mock_choice_fn.assert_called()   # select within provider
+
+    @patch('random.Random.choice')
+    @patch('random.Random.choices', wraps=random.Random().choices)
+    def test_weighting_disabled_multiple_choices(self, mock_choices_fn, mock_choice_fn):
+        fake = Faker(use_weighting=False)
+        fake.uri_path(deep=3)
+
+        assert mock_choices_fn.mock_calls[0][2]["k"] == 3
+        assert mock_choices_fn.mock_calls[0][2]["weights"] is None
+        mock_choice_fn.assert_not_called()
+
+    @patch('random.Random.choice')
+    @patch('random.Random.choices', wraps=random.Random().choices)
+    def test_weighting_enabled_multiple_choices(self, mock_choices_fn, mock_choice_fn):
+        fake = Faker(use_weighting=True)
+        fake.uri_path(deep=3)
+
+        assert mock_choices_fn.mock_calls[0][2]["k"] == 3
+        assert mock_choices_fn.mock_calls[0][2]["weights"] is None
+        mock_choice_fn.assert_not_called()
+
     def test_dir_include_all_providers_attribute_in_list(self):
         fake = Faker(['en_US', 'en_PH'])
         expected = set(dir(Faker) + [