Skip to content

Commit

Permalink
Faker optimizations as described in README.rst (joke2k#1348)
Browse files Browse the repository at this point in the history
* Add micro-benchmark to make it easy to detect regressions.

* Faker optimizations as described in README.rst .

* Remove cruft

* Flake8, bugfix

* Remove use_internal_caches
Use tuples instead of lists in several places
Move fakebench so it doesn't run during tests
Unit tests

* Include benchmark

* Improve arg naming

* Tighten up type

* Remove benchmark

* Revert "Improve arg naming"

This reverts commit d867c31.

* Remove invalid Dutch surname (joke2k#1347) (joke2k#1349)

* Update CHANGELOG.md

* Bump version: 5.0.1 → 5.0.2

* fix: typo faker_locale -> faker_seed in pytest-fixtures (joke2k#1351)

* en_IN person provider (joke2k#1355)

* en_IN person provider

* en_IN person provider linted

* en_IN person providers removed formats

* Update CHANGELOG.md

* Bump version: 5.0.2 → 5.1.0

* fix typo in changelog

* en_IN address + phone number provider (joke2k#1357)

* en_IN person provider

* en_IN person provider linted

* en_IN person providers removed formats

* en_IN address+phone_number added

* test_address linted

* Update CHANGELOG.md

* Bump version: 5.1.0 → 5.2.0

* FW and JSON Improvements (joke2k#1350)

* FW and JSON Improvements

* Update assertion to be more clear

* Update CHANGELOG.md

* Bump version: 5.2.0 → 5.3.0

* da_DK address provider (joke2k#1353)

* Add dk_DK address provider

* Add tests for address provider dk_DK

* Modified tests and changed source for postcodes

* Fix capitalization error

* Fix locale code from dk_DK to da_DK

* Update CHANGELOG.md

* Bump version: 5.3.0 → 5.4.0

* Remove empty string from  `ar_AA` Person Provider (joke2k#1364)

* Update CHANGELOG.md

* Bump version: 5.4.0 → 5.4.1

Co-authored-by: Theo Sinnige <72202732+TheoSinnige@users.noreply.github.com>
Co-authored-by: fcurella <flavio.curella@gmail.com>
Co-authored-by: legiahoang <giahoangth@gmail.com>
Co-authored-by: Pulkit Gupta <42977200+pulkitgupta2k@users.noreply.github.com>
Co-authored-by: johnbrandborg <john.brandborg@protonmail.com>
Co-authored-by: JoseNavy <72418396+JoseNavy@users.noreply.github.com>
Co-authored-by: Flavio Curella <89607+fcurella@users.noreply.github.com>
  • Loading branch information
8 people committed Jun 2, 2021
1 parent 5a00b96 commit 8415089
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 17 deletions.
9 changes: 9 additions & 0 deletions README.rst
Expand Up @@ -165,6 +165,15 @@ providers package. The localization of Faker is an ongoing process, for
which we need your help. Please don't hesitate to create a localized
provider for your own locale and submit a Pull Request (PR).

Optimizations
-------------
The Faker constructor takes a performance-related argument called
``use_weighting``. When it is ``True`` (the default), Faker tries to make
generated values occur with their real-world frequencies (e.g. the English
name Gary is produced much more often than the name Lorimer). When
``use_weighting`` is ``False``, every item has an equal chance of being
selected, and the selection process is much faster. For example:

.. code:: python

    from faker import Faker
    fake = Faker(use_weighting=False)

Command line usage
------------------

Expand Down
5 changes: 5 additions & 0 deletions faker/factory.py
Expand Up @@ -27,6 +27,9 @@ def create(
providers=None,
generator=None,
includes=None,
# Should we use weightings (more realistic) or weight every element equally (faster)?
# By default, use weightings for backwards compatibility & realism
use_weighting=True,
**config):
if includes is None:
includes = []
Expand All @@ -39,6 +42,7 @@ def create(
raise AttributeError(msg)

config['locale'] = locale
config['use_weighting'] = use_weighting
providers = providers or PROVIDERS

providers += includes
Expand All @@ -51,6 +55,7 @@ def create(

prov_cls, lang_found = cls._get_provider_class(prov_name, locale)
provider = prov_cls(faker)
provider.__use_weighting__ = use_weighting
provider.__provider__ = prov_name
provider.__lang__ = lang_found
faker.add_provider(provider)
Expand Down
21 changes: 15 additions & 6 deletions faker/providers/__init__.py
Expand Up @@ -17,6 +17,7 @@ class BaseProvider:

__provider__ = 'base'
__lang__ = None
__use_weighting__ = False

# Locales supported by Linux Mint from `/usr/share/i18n/SUPPORTED`
language_locale_codes = {
Expand Down Expand Up @@ -215,7 +216,8 @@ def random_uppercase_letter(self):
"""
return self.generator.random.choice(string.ascii_uppercase)

def random_elements(self, elements=('a', 'b', 'c'), length=None, unique=False):
def random_elements(self, elements=('a', 'b', 'c'), length=None, unique=False,
use_weighting=None):
"""Generate a list of randomly sampled objects from ``elements``.
Set ``unique`` to ``False`` for random sampling with replacement, and set ``unique`` to
Expand Down Expand Up @@ -271,6 +273,10 @@ def random_elements(self, elements=('a', 'b', 'c'), length=None, unique=False):
("d", 0.05),
]), unique=True
"""
use_weighting = (use_weighting
if use_weighting is not None
else self.__use_weighting__)

if isinstance(elements, dict) and not isinstance(elements, OrderedDict):
raise ValueError("Use OrderedDict only to avoid dependency on PYTHONHASHSEED (See #363).")

Expand All @@ -284,18 +290,21 @@ def random_elements(self, elements=('a', 'b', 'c'), length=None, unique=False):
"Sample length cannot be longer than the number of unique elements to pick from.")

if isinstance(elements, dict):
choices = elements.keys()
probabilities = elements.values()
if not hasattr(elements, "_key_cache"):
elements._key_cache = tuple(elements.keys())

choices = elements._key_cache
probabilities = tuple(elements.values()) if use_weighting else None
else:
if unique:
# shortcut
return self.generator.random.sample(elements, length)
choices = elements
probabilities = [1.0 for _ in range(len(choices))]
probabilities = None

return fn(
list(choices),
list(probabilities),
tuple(choices),
probabilities,
self.generator.random,
length=length,
)
Expand Down
7 changes: 5 additions & 2 deletions faker/proxy.py
Expand Up @@ -24,7 +24,8 @@ class Faker:
]

def __init__(self, locale=None, providers=None,
generator=None, includes=None, **config):
generator=None, includes=None,
use_weighting=True, **config):
self._factory_map = OrderedDict()
self._weights = None
self._unique_proxy = UniqueProxy(self)
Expand Down Expand Up @@ -56,7 +57,9 @@ def __init__(self, locale=None, providers=None,
locales = [DEFAULT_LOCALE]

for locale in locales:
self._factory_map[locale] = Factory.create(locale, providers, generator, includes, **config)
self._factory_map[locale] = Factory.create(locale, providers, generator, includes,
use_weighting=use_weighting,
**config)

self._locales = locales
self._factories = list(self._factory_map.values())
Expand Down
25 changes: 17 additions & 8 deletions faker/utils/distribution.py
@@ -1,7 +1,8 @@
import bisect
import itertools

from random import Random
from typing import Generator, Iterable, List, Optional, TypeVar
from typing import Generator, Iterable, Optional, Sequence, TypeVar

from faker.generator import random as mod_random

Expand All @@ -23,8 +24,8 @@ def cumsum(it: Iterable[float]) -> Generator[float, None, None]:


def choices_distribution_unique(
a: List[T], p: List[float], random: Optional[Random] = None, length: int = 1,
) -> List[T]:
a: Sequence[T], p: Sequence[float], random: Optional[Random] = None, length: int = 1,
) -> Sequence[T]:
# As of Python 3.7, there isn't a way to sample unique elements that takes
# weight into account.
if random is None:
Expand All @@ -37,7 +38,7 @@ def choices_distribution_unique(
items = list(a)
probabilities = list(p)
for i in range(length):
cdf = list(cumsum(probabilities))
cdf = tuple(cumsum(probabilities))
normal = cdf[-1]
cdf2 = [float(i) / float(normal) for i in cdf]
uniform_sample = random_sample(random=random)
Expand All @@ -49,18 +50,26 @@ def choices_distribution_unique(
return choices


def choices_distribution(a: List[T], p: List[float], random: Optional[Random] = None, length: int = 1) -> List[T]:
def choices_distribution(
a: Sequence[T], p: Sequence[float], random: Optional[Random] = None, length: int = 1,
) -> Sequence[T]:
if random is None:
random = mod_random

assert len(a) == len(p)
if p is not None:
assert len(a) == len(p)

if hasattr(random, 'choices'):
choices = random.choices(a, weights=p, k=length)
return choices
if length == 1 and p is None:
return (random.choice(a),)
else:
return random.choices(a, weights=p, k=length)
else:
choices = []

if p is None:
p = itertools.repeat(1, len(a))

cdf = list(cumsum(p))
normal = cdf[-1]
cdf2 = [float(i) / float(normal) for i in cdf]
Expand Down
2 changes: 1 addition & 1 deletion tests/providers/test_internet.py
Expand Up @@ -236,7 +236,7 @@ def test_ipv4_distribution_selection(self):
list_of_invalid_weights = [
[1, 2, 3], # List size does not match subnet list size
['a', 'b'], # List size matches, but elements are invalid
None, # Not a list or valid iterable
11, # Not a list or valid iterable
]

with patch('faker.providers.internet.choices_distribution',
Expand Down
58 changes: 58 additions & 0 deletions tests/test_proxy.py
@@ -1,3 +1,5 @@
import random

from collections import OrderedDict
from unittest.mock import PropertyMock, patch

Expand Down Expand Up @@ -318,6 +320,62 @@ def test_multiple_locale_factory_selection_unsupported_method(self):
with pytest.raises(AttributeError):
fake.obviously_invalid_provider_method_a23f()

@patch('random.Random.choice')
@patch('random.Random.choices')
def test_weighting_disabled_single_choice(self, mock_choices_fn, mock_choice_fn):
    """With weighting off, picking one value uses ``choice`` and never ``choices``."""
    unweighted_faker = Faker(use_weighting=False)
    unweighted_faker.first_name()

    # The two checks are independent: the weighted sampler must stay
    # untouched while the plain single-element picker must have been used.
    mock_choices_fn.assert_not_called()
    mock_choice_fn.assert_called()

@patch('random.Random.choice')
@patch('random.Random.choices', wraps=random.Random().choices)
def test_weighting_disabled_with_locales(self, mock_choices_fn, mock_choice_fn):
    """Weighted locales with ``use_weighting=False`` call both ``choices`` and ``choice``."""
    # Locale code -> relative weight, in insertion order.
    weighted_locales = OrderedDict()
    weighted_locales['de_DE'] = 3
    weighted_locales['en-US'] = 2
    weighted_locales['en-PH'] = 1
    weighted_locales['ja_JP'] = 5

    unweighted_faker = Faker(weighted_locales, use_weighting=False)
    unweighted_faker.first_name()

    # ``choices`` is expected for selecting the provider (locale) ...
    mock_choices_fn.assert_called()
    # ... and ``choice`` for selecting the value within that provider.
    mock_choice_fn.assert_called()

@patch('random.Random.choice')
@patch('random.Random.choices', wraps=random.Random().choices)
def test_weighting_disabled_multiple_locales(self, mock_choices_fn, mock_choice_fn):
    """Several weighted locales with weighting off use ``choices`` then ``choice``."""
    locale_weight_pairs = (
        ('de_DE', 3),
        ('en-US', 2),
        ('en-PH', 1),
        ('ja_JP', 5),
    )
    multi_locale_faker = Faker(OrderedDict(locale_weight_pairs), use_weighting=False)
    multi_locale_faker.first_name()

    # Provider (locale) selection goes through ``choices``.
    mock_choices_fn.assert_called()
    # Selection within the chosen provider goes through ``choice``.
    mock_choice_fn.assert_called()

@patch('random.Random.choice')
@patch('random.Random.choices', wraps=random.Random().choices)
def test_weighting_disabled_multiple_choices(self, mock_choices_fn, mock_choice_fn):
    """With weighting off, a multi-element pick calls ``choices`` without weights."""
    unweighted_faker = Faker(use_weighting=False)
    unweighted_faker.uri_path(deep=3)

    # Index 2 of a recorded call holds its keyword arguments.
    first_call = mock_choices_fn.mock_calls[0]
    first_call_kwargs = first_call[2]
    assert first_call_kwargs["k"] == 3
    assert first_call_kwargs["weights"] is None

    # Sampling several elements must not fall back to single ``choice`` calls.
    mock_choice_fn.assert_not_called()

@patch('random.Random.choice')
@patch('random.Random.choices', wraps=random.Random().choices)
def test_weighting_enabled_multiple_choices(self, mock_choices_fn, mock_choice_fn):
    """With weighting on, ``uri_path(deep=3)`` still calls ``choices`` with no weights."""
    weighted_faker = Faker(use_weighting=True)
    weighted_faker.uri_path(deep=3)

    # Index 2 of a recorded call holds its keyword arguments.
    first_call = mock_choices_fn.mock_calls[0]
    first_call_kwargs = first_call[2]
    assert first_call_kwargs["k"] == 3
    # NOTE(review): weights are None even though weighting is enabled;
    # presumably uri_path samples from a plain (non-dict) sequence, which
    # carries no weights -- confirm against the internet provider.
    assert first_call_kwargs["weights"] is None

    mock_choice_fn.assert_not_called()

def test_dir_include_all_providers_attribute_in_list(self):
fake = Faker(['en_US', 'en_PH'])
expected = set(dir(Faker) + [
Expand Down

0 comments on commit 8415089

Please sign in to comment.