Merge branch 'develop' into f/example-branch-id-pk-blog
* develop: (29 commits)
  [BUGFIX] pydantic>=1.10.4 - ImportError: cannot import name dataclass_transform (#7163)
  [MAINTENANCE] ZEP - update asset factories method signatures from asset models (#7096)
  Delete cli v012 tests. (#7159)
  [CONTRIB] added new Expectations  - India_zip_code expectation and not_to_be_future_date expectation (#6086)
  [MAINTENANCE] Remove unused dockerfile (#7152)
  [DOCS] doc-464 consolidating and standardizing snippets (#7154)
  [BUGFIX] Patch broken rendered content Cloud tests (#7155)
  [MAINTENANCE] Clean up `mypy` violations in `CardinalityChecker` (#7146)
  [MAINTENANCE] Clean up pathlib.Path() usage in DataConnector utilities and restore tighter formatting in great_expectations/util.py  (#7149)
  [MAINTENANCE] Change all instances of `create_expectation_suite` to `add_expectation_suite` in tests, docs, and source code (#7117)
  [BUGFIX] Parse pandas version correctly for development builds (#7147)
  [MAINTENANCE] Update V3 DataConnector utilities to support New Datasources (ZEP) (#7144)
  [BUGFIX] Patch inconsistent ordering within GCP test asserts (#7130)
  Refactor sql splitter to take selectable instead of str. (#7133)
  [BUGFIX] `TupleAzureBlobStoreBackend` no longer gives warning when obfuscating connection string (#7139)
  [MAINTENANCE] ruff 0.0.246 update (#7137)
  [MAINTENANCE] Output Consistent Data Format from "table.head" Metric for every ExecutionEngine (#7134)
  [BUGFIX] Copy previous versions after checking out the current commit (#7142)
  [DOCS] Remove sitemap.xml (#7141)
  [MAINTENANCE] mypy `v1.0.0` (#7138)
  ...
Will Shin committed Feb 16, 2023
2 parents 3f5bae9 + 68752af commit eea6f68
Showing 273 changed files with 4,424 additions and 14,250 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -15,12 +15,12 @@ repos:
   - repo: https://github.com/psf/black
     rev: 22.3.0
     hooks:
-      - id: black
+      - id: black-jupyter
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: 'v0.0.241'
+    rev: 'v0.0.246'
     hooks:
       - id: ruff
-        files: ^(great_expectations|contrib) # TODO: add tests/ scripts/ etc.
+        files: ^(great_expectations|contrib|tasks\.py) # TODO: add tests/ scripts/ etc.
         args: ["--fix"]
 # https://pre-commit.ci/
 ci:
6 changes: 3 additions & 3 deletions contrib/cli/requirements.txt
@@ -1,9 +1,9 @@
-black==22.3.0 # Linting / code style
+black[jupyter]==22.3.0 # Linting / code style
 Click>=7.1.2 # CLI tooling
 cookiecutter==1.7.3 # Project templating
-mypy==0.991 # Type checker
+mypy==1.0.0 # Type checker
 pydantic>=1.0,<2.0 # Needed for mypy plugin
 pytest>=5.3.5 # Test framework
-ruff==0.0.241 # Linting / code style
+ruff==0.0.246 # Linting / code style
 twine==3.7.1 # Packaging
 wheel==0.37.1 # Packaging
187 changes: 187 additions & 0 deletions (new file)
@@ -0,0 +1,187 @@
"""
This is a template for creating custom ColumnMapExpectations.
For detailed instructions on how to use it, please see:
https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_map_expectations
"""

from typing import List, Optional

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.expectations.expectation import ColumnMapExpectation
from great_expectations.expectations.metrics import (
    ColumnMapMetricProvider,
    column_condition_partial,
)


def are_values_after_split_in_value_set(
    val: str, delimiter: str, value_set: List[str]
) -> bool:
    all_split_values = [v.strip() for v in val.split(delimiter)]

    for split_value in all_split_values:
        if split_value not in value_set:
            return False

    return True
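
# For example, with delimiter "," and value_set ["hockey", "tennis"]:
#   are_values_after_split_in_value_set("hockey, tennis", ",", ["hockey", "tennis"]) -> True
#   are_values_after_split_in_value_set("hockey,cnn", ",", ["hockey", "tennis"])     -> False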


# This class defines a Metric to support your Expectation.
# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
class ColumnValuesAfterSplitInSet(ColumnMapMetricProvider):

    # This is the id string that will be used to reference your metric.
    condition_metric_name = "column_values.values_after_split_in_set"
    condition_value_keys = (
        "delimiter",
        "value_set",
    )

    # This method implements the core logic for the PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, delimiter, value_set, **kwargs):
        value_set = set(value_set)
        return column.apply(
            lambda x: are_values_after_split_in_value_set(x, delimiter, value_set)
        )

    # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
    # @column_condition_partial(engine=SqlAlchemyExecutionEngine)
    # def _sqlalchemy(cls, column, _dialect, **kwargs):
    #     raise NotImplementedError

    # This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine
    # @column_condition_partial(engine=SparkDFExecutionEngine)
    # def _spark(cls, column, **kwargs):
    #     raise NotImplementedError


# This class defines the Expectation itself
class ExpectColumnValuesAfterSplitToBeInSet(ColumnMapExpectation):
    """Expect values in the column after splitting on a delimiter to be in a pre-defined set."""

    # These examples will be shown in the public gallery.
    # They will also be executed as unit tests for your Expectation.
    examples = [
        {
            "data": {
                "allowed_sports": [
                    "hockey,football",
                    "cricket,tennis",
                    "tennis,badminton,hockey",
                ],
                "not_sports": ["cnn,hockey", "football", "badminton,judo,BBC"],
            },
            "tests": [
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "allowed_sports",
                        "delimiter": ",",
                        "value_set": [
                            "hockey",
                            "football",
                            "tennis",
                            "badminton",
                            "cricket",
                        ],
                    },
                    "out": {
                        "success": True,
                    },
                },
                {
                    "title": "basic_negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "not_sports",
                        "delimiter": ",",
                        "value_set": [
                            "hockey",
                            "football",
                            "tennis",
                            "badminton",
                            "cricket",
                        ],
                    },
                    "out": {
                        "success": False,
                    },
                },
                {
                    "title": "positive_test_with_mostly",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "not_sports",
                        "delimiter": ",",
                        "value_set": [
                            "hockey",
                            "football",
                            "tennis",
                            "badminton",
                            "cricket",
                        ],
                        "mostly": 0.33,
                    },
                    "out": {
                        "success": True,
                    },
                },
            ],
        }
    ]
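
    # Note on "positive_test_with_mostly": only the "football" row of
    # "not_sports" splits entirely into the value_set, so the pass rate is
    # 1/3 (~0.33), which just meets the mostly threshold of 0.33.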

    # This is the id string of the Metric used by this Expectation.
    # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
    map_metric = "column_values.values_after_split_in_set"

    # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
    success_keys = ("mostly", "value_set", "delimiter")

    # This dictionary contains default values for any parameters that should have default values
    default_kwarg_values = {}

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration] = None
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (Optional[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation

        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """

        super().validate_configuration(configuration)
        configuration = configuration or self.configuration

        # # Check other things in configuration.kwargs and raise Exceptions if needed
        # try:
        #     assert (
        #         ...
        #     ), "message"
        #     assert (
        #         ...
        #     ), "message"
        # except AssertionError as e:
        #     raise InvalidExpectationConfigurationError(str(e))

    # This object contains metadata for display in the public Gallery
    library_metadata = {
        "tags": ["pandas"],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@ace-racer",  # Don't forget to add your github handle here!
        ],
    }


if __name__ == "__main__":
    ExpectColumnValuesAfterSplitToBeInSet().print_diagnostic_checklist()
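
Once this module is imported, the expectation is registered and exposed on a Validator under its snake_case name. A minimal usage sketch follows; the `validator` object is an assumption here, standing in for a pandas-backed Validator configured elsewhere:

result = validator.expect_column_values_after_split_to_be_in_set(
    column="allowed_sports",
    delimiter=",",
    value_set=["hockey", "football", "tennis", "badminton", "cricket"],
    mostly=0.9,
)
print(result.success)  # expected True for the example data above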
156 changes: 156 additions & 0 deletions (new file)
@@ -0,0 +1,156 @@
"""
This is a template for creating custom ColumnMapExpectations.
For detailed instructions on how to use it, please see:
https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_map_expectations
"""

from typing import Optional

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.expectations.expectation import ColumnMapExpectation
from great_expectations.expectations.metrics import (
    ColumnMapMetricProvider,
    column_condition_partial,
)


def are_values_after_split_unique(val: str, delimiter: str) -> bool:
    all_split_values = val.split(delimiter)
    unique_split_values = set(all_split_values)
    return len(all_split_values) == len(unique_split_values)
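
# For example, with delimiter ",":
#   are_values_after_split_unique("hockey,football", ",")   -> True
#   are_values_after_split_unique("football,football", ",") -> False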


# This class defines a Metric to support your Expectation.
# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
class ColumnValuesAfterSplitAreUnique(ColumnMapMetricProvider):

    # This is the id string that will be used to reference your metric.
    condition_metric_name = "column_values.values_after_split_are_unique"
    condition_value_keys = ("delimiter",)

    # This method implements the core logic for the PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, **kwargs):
        delimiter = kwargs.get("delimiter", ",")
        return column.apply(lambda x: are_values_after_split_unique(x, delimiter))

    # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
    # @column_condition_partial(engine=SqlAlchemyExecutionEngine)
    # def _sqlalchemy(cls, column, _dialect, **kwargs):
    #     raise NotImplementedError

    # This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine
    # @column_condition_partial(engine=SparkDFExecutionEngine)
    # def _spark(cls, column, **kwargs):
    #     raise NotImplementedError


# This class defines the Expectation itself
class ExpectColumnValuesAfterSplitToBeUnique(ColumnMapExpectation):
    """Expect values in the column after splitting on a delimiter to be unique."""

    # These examples will be shown in the public gallery.
    # They will also be executed as unit tests for your Expectation.
    examples = [
        {
            "data": {
                "unique_sports": [
                    "hockey,football",
                    "cricket",
                    "tennis,badminton,hockey",
                ],
                "duplicate_sports": [
                    "foosball,hockey",
                    "football,football",
                    "badminton,judo",
                ],
            },
            "tests": [
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {"column": "unique_sports", "delimiter": ","},
                    "out": {
                        "success": True,
                    },
                },
                {
                    "title": "basic_negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {"column": "duplicate_sports", "delimiter": ","},
                    "out": {
                        "success": False,
                    },
                },
                {
                    "title": "positive_test_with_mostly",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "duplicate_sports",
                        "delimiter": ",",
                        "mostly": 0.66,
                    },
                    "out": {
                        "success": True,
                    },
                },
            ],
        }
    ]
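
    # Note on "positive_test_with_mostly": "foosball,hockey" and
    # "badminton,judo" split into unique values while "football,football"
    # does not, so the pass rate is 2/3 (~0.67), above the mostly
    # threshold of 0.66.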

    # This is the id string of the Metric used by this Expectation.
    # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
    map_metric = "column_values.values_after_split_are_unique"

    # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
    success_keys = (
        "mostly",
        "delimiter",
    )

    # This dictionary contains default values for any parameters that should have default values
    default_kwarg_values = {}

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration] = None
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (Optional[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation

        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """

        super().validate_configuration(configuration)
        configuration = configuration or self.configuration

        # # Check other things in configuration.kwargs and raise Exceptions if needed
        # try:
        #     assert (
        #         ...
        #     ), "message"
        #     assert (
        #         ...
        #     ), "message"
        # except AssertionError as e:
        #     raise InvalidExpectationConfigurationError(str(e))

    # This object contains metadata for display in the public Gallery
    library_metadata = {
        "tags": ["pandas"],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@ace-racer",  # Don't forget to add your github handle here!
        ],
    }


if __name__ == "__main__":
    ExpectColumnValuesAfterSplitToBeUnique().print_diagnostic_checklist()
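
As above, a minimal usage sketch, assuming this module has been imported and `validator` (an assumption, not part of the committed code) is a pandas-backed Validator:

result = validator.expect_column_values_after_split_to_be_unique(
    column="unique_sports",
    delimiter=",",
)
print(result.success)  # expected True for the example data above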
