Merge branch 'develop' into f/example-branch-id-pk-blog
* develop: (29 commits)
  [BUGFIX] pydantic>=1.10.4 - ImportError: cannot import name dataclass_transform (#7163)
  [MAINTENANCE] ZEP - update asset factories method signatures from asset models (#7096)
  Delete cli v012 tests. (#7159)
  [CONTRIB] added new Expectations  - India_zip_code expectation and not_to_be_future_date expectation (#6086)
  [MAINTENANCE] Remove unused dockerfile (#7152)
  [DOCS] doc-464 consolidating and standardizing snippets (#7154)
  [BUGFIX] Patch broken rendered content Cloud tests (#7155)
  [MAINTENANCE] Clean up `mypy` violations in `CardinalityChecker` (#7146)
  [MAINTENANCE] Clean up pathlib.Path() usage in DataConnector utilities and restore tighter formatting in great_expectations/util.py  (#7149)
  [MAINTENANCE] Change all instances of `create_expectation_suite` to `add_expectation_suite` in tests, docs, and source code (#7117)
  [BUGFIX] Parse pandas version correctly for development builds (#7147)
  [MAINTENANCE] Update V3 DataConnector utilities to support New Datasources (ZEP) (#7144)
  [BUGFIX] Patch inconsistent ordering within GCP test asserts (#7130)
  Refactor sql splitter to take selectable instead of str. (#7133)
  [BUGFIX] `TupleAzureBlobStoreBackend` no longer gives warning when obfuscating connection string (#7139)
  [MAINTENANCE] ruff 0.0.246 update (#7137)
  [MAINTENANCE] Output Consistent Data Format from "table.head" Metric for every ExecutionEngine (#7134)
  [BUGFIX] Copy previous versions after checking out the current commit (#7142)
  [DOCS] Remove sitemap.xml (#7141)
  [MAINTENANCE] mypy `v1.0.0` (#7138)
  ...
Will Shin committed Feb 16, 2023
2 parents 3f5bae9 + 68752af commit eea6f68
Showing 273 changed files with 4,424 additions and 14,250 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -15,12 +15,12 @@ repos:
   - repo: https://github.com/psf/black
     rev: 22.3.0
     hooks:
-      - id: black
+      - id: black-jupyter
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: 'v0.0.241'
+    rev: 'v0.0.246'
     hooks:
       - id: ruff
-        files: ^(great_expectations|contrib) # TODO: add tests/ scripts/ etc.
+        files: ^(great_expectations|contrib|tasks\.py) # TODO: add tests/ scripts/ etc.
         args: ["--fix"]
 # https://pre-commit.ci/
 ci:
6 changes: 3 additions & 3 deletions contrib/cli/requirements.txt
@@ -1,9 +1,9 @@
-black==22.3.0 # Linting / code style
+black[jupyter]==22.3.0 # Linting / code style
 Click>=7.1.2 # CLI tooling
 cookiecutter==1.7.3 # Project templating
-mypy==0.991 # Type checker
+mypy==1.0.0 # Type checker
 pydantic>=1.0,<2.0 # Needed for mypy plugin
 pytest>=5.3.5 # Test framework
-ruff==0.0.241 # Linting / code style
+ruff==0.0.246 # Linting / code style
 twine==3.7.1 # Packaging
 wheel==0.37.1 # Packaging
187 changes: 187 additions & 0 deletions (new file)
@@ -0,0 +1,187 @@
"""
This is a template for creating custom ColumnMapExpectations.
For detailed instructions on how to use it, please see:
https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_map_expectations
"""

from typing import List, Optional

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.expectations.expectation import ColumnMapExpectation
from great_expectations.expectations.metrics import (
    ColumnMapMetricProvider,
    column_condition_partial,
)


def are_values_after_split_in_value_set(
    val: str, delimiter: str, value_set: List[str]
) -> bool:
    all_split_values = [v.strip() for v in val.split(delimiter)]

    for split_value in all_split_values:
        if split_value not in value_set:
            return False

    return True
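
# For example, with delimiter "," and value_set ["hockey", "tennis"]:
#   are_values_after_split_in_value_set("hockey, tennis", ",", ["hockey", "tennis"]) -> True
#   are_values_after_split_in_value_set("hockey,cnn", ",", ["hockey", "tennis"])     -> False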


# This class defines a Metric to support your Expectation.
# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
class ColumnValuesAfterSplitInSet(ColumnMapMetricProvider):

    # This is the id string that will be used to reference your metric.
    condition_metric_name = "column_values.values_after_split_in_set"
    condition_value_keys = (
        "delimiter",
        "value_set",
    )

    # This method implements the core logic for the PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, delimiter, value_set, **kwargs):
        value_set = set(value_set)
        return column.apply(
            lambda x: are_values_after_split_in_value_set(x, delimiter, value_set)
        )

    # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
    # @column_condition_partial(engine=SqlAlchemyExecutionEngine)
    # def _sqlalchemy(cls, column, _dialect, **kwargs):
    #     raise NotImplementedError

    # This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine
    # @column_condition_partial(engine=SparkDFExecutionEngine)
    # def _spark(cls, column, **kwargs):
    #     raise NotImplementedError


# This class defines the Expectation itself
class ExpectColumnValuesAfterSplitToBeInSet(ColumnMapExpectation):
    """Expect values in the column after splitting on a delimiter to be in a pre-defined set."""

    # These examples will be shown in the public gallery.
    # They will also be executed as unit tests for your Expectation.
    examples = [
        {
            "data": {
                "allowed_sports": [
                    "hockey,football",
                    "cricket,tennis",
                    "tennis,badminton,hockey",
                ],
                "not_sports": ["cnn,hockey", "football", "badminton,judo,BBC"],
            },
            "tests": [
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "allowed_sports",
                        "delimiter": ",",
                        "value_set": [
                            "hockey",
                            "football",
                            "tennis",
                            "badminton",
                            "cricket",
                        ],
                    },
                    "out": {
                        "success": True,
                    },
                },
                {
                    "title": "basic_negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "not_sports",
                        "delimiter": ",",
                        "value_set": [
                            "hockey",
                            "football",
                            "tennis",
                            "badminton",
                            "cricket",
                        ],
                    },
                    "out": {
                        "success": False,
                    },
                },
                {
                    "title": "positive_test_with_mostly",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "not_sports",
                        "delimiter": ",",
                        "value_set": [
                            "hockey",
                            "football",
                            "tennis",
                            "badminton",
                            "cricket",
                        ],
                        "mostly": 0.33,
                    },
                    "out": {
                        "success": True,
                    },
                },
            ],
        }
    ]
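
    # Note on "positive_test_with_mostly": only the "football" row of
    # "not_sports" splits entirely into the value_set, so the pass rate is
    # 1/3 (~0.33), which just meets the mostly threshold of 0.33.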

    # This is the id string of the Metric used by this Expectation.
    # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
    map_metric = "column_values.values_after_split_in_set"

    # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
    success_keys = ("mostly", "value_set", "delimiter")

    # This dictionary contains default values for any parameters that should have default values
    default_kwarg_values = {}

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration] = None
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (Optional[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation

        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """

        super().validate_configuration(configuration)
        configuration = configuration or self.configuration

        # # Check other things in configuration.kwargs and raise Exceptions if needed
        # try:
        #     assert (
        #         ...
        #     ), "message"
        #     assert (
        #         ...
        #     ), "message"
        # except AssertionError as e:
        #     raise InvalidExpectationConfigurationError(str(e))

    # This object contains metadata for display in the public Gallery
    library_metadata = {
        "tags": ["pandas"],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@ace-racer",  # Don't forget to add your github handle here!
        ],
    }


if __name__ == "__main__":
    ExpectColumnValuesAfterSplitToBeInSet().print_diagnostic_checklist()
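
Once this module is imported, the expectation is registered and exposed on a Validator under its snake_case name. A minimal usage sketch follows; the `validator` object is an assumption here, standing in for a pandas-backed Validator configured elsewhere:

result = validator.expect_column_values_after_split_to_be_in_set(
    column="allowed_sports",
    delimiter=",",
    value_set=["hockey", "football", "tennis", "badminton", "cricket"],
    mostly=0.9,
)
print(result.success)  # expected True for the example data above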
156 changes: 156 additions & 0 deletions (new file)
@@ -0,0 +1,156 @@
"""
This is a template for creating custom ColumnMapExpectations.
For detailed instructions on how to use it, please see:
https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_map_expectations
"""

from typing import Optional

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.expectations.expectation import ColumnMapExpectation
from great_expectations.expectations.metrics import (
    ColumnMapMetricProvider,
    column_condition_partial,
)


def are_values_after_split_unique(val: str, delimiter: str) -> bool:
    all_split_values = val.split(delimiter)
    unique_split_values = set(all_split_values)
    return len(all_split_values) == len(unique_split_values)
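
# For example, with delimiter ",":
#   are_values_after_split_unique("hockey,football", ",")   -> True
#   are_values_after_split_unique("football,football", ",") -> False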


# This class defines a Metric to support your Expectation.
# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
class ColumnValuesAfterSplitAreUnique(ColumnMapMetricProvider):

    # This is the id string that will be used to reference your metric.
    condition_metric_name = "column_values.values_after_split_are_unique"
    condition_value_keys = ("delimiter",)

    # This method implements the core logic for the PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, **kwargs):
        delimiter = kwargs.get("delimiter", ",")
        return column.apply(lambda x: are_values_after_split_unique(x, delimiter))

    # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
    # @column_condition_partial(engine=SqlAlchemyExecutionEngine)
    # def _sqlalchemy(cls, column, _dialect, **kwargs):
    #     raise NotImplementedError

    # This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine
    # @column_condition_partial(engine=SparkDFExecutionEngine)
    # def _spark(cls, column, **kwargs):
    #     raise NotImplementedError


# This class defines the Expectation itself
class ExpectColumnValuesAfterSplitToBeUnique(ColumnMapExpectation):
    """Expect values in the column after splitting on a delimiter to be unique."""

    # These examples will be shown in the public gallery.
    # They will also be executed as unit tests for your Expectation.
    examples = [
        {
            "data": {
                "unique_sports": [
                    "hockey,football",
                    "cricket",
                    "tennis,badminton,hockey",
                ],
                "duplicate_sports": [
                    "foosball,hockey",
                    "football,football",
                    "badminton,judo",
                ],
            },
            "tests": [
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {"column": "unique_sports", "delimiter": ","},
                    "out": {
                        "success": True,
                    },
                },
                {
                    "title": "basic_negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {"column": "duplicate_sports", "delimiter": ","},
                    "out": {
                        "success": False,
                    },
                },
                {
                    "title": "positive_test_with_mostly",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "duplicate_sports",
                        "delimiter": ",",
                        "mostly": 0.66,
                    },
                    "out": {
                        "success": True,
                    },
                },
            ],
        }
    ]
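
    # Note on "positive_test_with_mostly": "foosball,hockey" and
    # "badminton,judo" split into unique values while "football,football"
    # does not, so the pass rate is 2/3 (~0.67), above the mostly
    # threshold of 0.66.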

    # This is the id string of the Metric used by this Expectation.
    # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
    map_metric = "column_values.values_after_split_are_unique"

    # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
    success_keys = (
        "mostly",
        "delimiter",
    )

    # This dictionary contains default values for any parameters that should have default values
    default_kwarg_values = {}

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration] = None
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (Optional[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation

        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """

        super().validate_configuration(configuration)
        configuration = configuration or self.configuration

        # # Check other things in configuration.kwargs and raise Exceptions if needed
        # try:
        #     assert (
        #         ...
        #     ), "message"
        #     assert (
        #         ...
        #     ), "message"
        # except AssertionError as e:
        #     raise InvalidExpectationConfigurationError(str(e))

    # This object contains metadata for display in the public Gallery
    library_metadata = {
        "tags": ["pandas"],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@ace-racer",  # Don't forget to add your github handle here!
        ],
    }


if __name__ == "__main__":
    ExpectColumnValuesAfterSplitToBeUnique().print_diagnostic_checklist()
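
As above, a minimal usage sketch, assuming this module has been imported and `validator` (an assumption, not part of the committed code) is a pandas-backed Validator:

result = validator.expect_column_values_after_split_to_be_unique(
    column="unique_sports",
    delimiter=",",
)
print(result.success)  # expected True for the example data above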
