Progress bar for refutation tests (#567)

* fixed warnings and progress bar improvements Fixed the warnings arising in propensity score estimators and added optional progress bars for refuters Signed-off-by: Amey Varhade <ameyvarhade@gmail.com> * code structure improvements Signed-off-by: Amey Varhade <ameyvarhade@gmail.com> * removed vscode references * Added progress bar for dummy outcome refuter Signed-off-by: Amey Varhade <ameyvarhade@gmail.com> Signed-off-by: Amey Varhade <ameyvarhade@gmail.com> Co-authored-by: Amit Sharma <amit_sharma@live.com>
py-why · Aug 19, 2022 · f946386 · f946386
1 parent a38a03f
commit f946386
Show file tree

Hide file tree

Showing 9 changed files with 42 additions and 22 deletions.
diff --git a/docs/source/example_notebooks/dowhy_simple_example.ipynb b/docs/source/example_notebooks/dowhy_simple_example.ipynb
@@ -330,7 +330,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "res_random=model.refute_estimate(identified_estimand, estimate, method_name=\"random_common_cause\")\n",
+    "res_random=model.refute_estimate(identified_estimand, estimate, method_name=\"random_common_cause\", show_progress_bar=True)\n",
     "print(res_random)"
    ]
   },
@@ -348,7 +348,7 @@
    "outputs": [],
    "source": [
     "res_placebo=model.refute_estimate(identified_estimand, estimate,\n",
-    "        method_name=\"placebo_treatment_refuter\", placebo_type=\"permute\")\n",
+    "        method_name=\"placebo_treatment_refuter\", show_progress_bar=True, placebo_type=\"permute\")\n",
     "print(res_placebo)"
    ]
   },
@@ -366,7 +366,7 @@
    "outputs": [],
    "source": [
     "res_subset=model.refute_estimate(identified_estimand, estimate,\n",
-    "        method_name=\"data_subset_refuter\", subset_fraction=0.9)\n",
+    "        method_name=\"data_subset_refuter\", show_progress_bar=True, subset_fraction=0.9)\n",
     "print(res_subset)"
    ]
   },
@@ -388,7 +388,7 @@
    "outputs": [],
    "source": [
     "res_subset=model.refute_estimate(identified_estimand, estimate,\n",
-    "        method_name=\"data_subset_refuter\", subset_fraction=0.9, random_seed = 1, n_jobs=-1, verbose=10)\n",
+    "        method_name=\"data_subset_refuter\", show_progress_bar=True, subset_fraction=0.9, random_seed = 1, n_jobs=-1, verbose=10)\n",
     "print(res_subset)"
    ]
   },
@@ -488,7 +488,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3.7.10 ('venvrl')",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -502,7 +502,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.10"
+   "version": "3.8.13"
   },
   "toc": {
    "base_numbering": 1,

diff --git a/dowhy/causal_estimators/propensity_score_estimator.py b/dowhy/causal_estimators/propensity_score_estimator.py
@@ -71,10 +71,15 @@ def __init__(self, *args, propensity_score_model=None,
             raise Exception(error_msg)
 
     def _refresh_propensity_score(self):
+        '''
+            A custom estimator based on the way the propensity score estimates are to be used.
+            Invoked from the '_estimate_effect' method of various propensity score subclasses when the propensity score is not pre-computed.      
+        '''
         if self.recalculate_propensity_score is True:
             if self.propensity_score_model is None:
                 self.propensity_score_model = linear_model.LogisticRegression()
-            self.propensity_score_model.fit(self._observed_common_causes, self._treatment)
+            treatment_reshaped = np.ravel(self._treatment)
+            self.propensity_score_model.fit(self._observed_common_causes, treatment_reshaped)
             self._data[self.propensity_score_column] = self.propensity_score_model.predict_proba(self._observed_common_causes)[:, 1]
         else:
             # check if user provides the propensity score column

diff --git a/dowhy/causal_model.py b/dowhy/causal_model.py
@@ -387,7 +387,7 @@ def do(self, x, identified_estimand, method_name=None,
                 raise NotImplementedError
         return estimate
 
-    def refute_estimate(self, estimand, estimate, method_name=None, **kwargs):
+    def refute_estimate(self, estimand, estimate, method_name=None, show_progress_bar=False, **kwargs):
         """Refute an estimated causal effect.
 
         If method_name is provided, uses the provided method. In the future, we may support automatic selection of suitable refutation tests. Following refutation methods are supported.
@@ -399,6 +399,7 @@ def refute_estimate(self, estimand, estimate, method_name=None, **kwargs):
         :param estimand: target estimand, an instance of the IdentifiedEstimand class (typically, the output of identify_effect)
         :param estimate: estimate to be refuted, an instance of the CausalEstimate class (typically, the output of estimate_effect)
         :param method_name: name of the refutation method
+        :param show_progress_bar: Boolean flag on whether to show a progress bar
         :param kwargs:  (optional) additional arguments that are passed directly to the refutation method. Can specify a random seed here to ensure reproducible results ('random_seed' parameter). For method-specific parameters, consult the documentation for the specific method. All refutation methods are in the causal_refuters subpackage.
 
         :returns: an instance of the RefuteResult class
@@ -418,7 +419,7 @@ def refute_estimate(self, estimand, estimate, method_name=None, **kwargs):
             estimate=estimate,
             **kwargs
         )
-        res = refuter.refute_estimate()
+        res = refuter.refute_estimate(show_progress_bar)
         return res
 
     def view_model(self, layout="dot", size=(8, 6), file_name="causal_model"):

diff --git a/dowhy/causal_refuter.py b/dowhy/causal_refuter.py
@@ -16,6 +16,7 @@ class CausalRefuter:
     """
     # Default value for the number of simulations to be conducted
     DEFAULT_NUM_SIMULATIONS = 100
+    PROGRESS_BAR_COLOR = 'green'
 
     def __init__(self, data, identified_estimand, estimate, **kwargs):
         self._data = data
@@ -214,7 +215,7 @@ def perform_normal_distribution_test(self, estimate, simulations):
 
         return p_value
 
-    def refute_estimate(self):
+    def refute_estimate(self, show_progress_bar=False):
         raise NotImplementedError
 
 

diff --git a/dowhy/causal_refuters/add_unobserved_common_cause.py b/dowhy/causal_refuters/add_unobserved_common_cause.py
@@ -4,6 +4,8 @@
 import pandas as pd
 import scipy.stats
 
+from tqdm.auto import tqdm
+
 import math
 import statsmodels.api as sm
 from sklearn.preprocessing import StandardScaler
@@ -179,7 +181,7 @@ def infer_default_kappa_y(self, len_kappa_y = 10):
         else:
             return np.arange(min_coeff, max_coeff, step)
 
-    def refute_estimate(self):
+    def refute_estimate(self, show_progress_bar=False):
         """
         This function attempts to add an unobserved common cause to the outcome and the treatment. At present, we have implemented the behavior for one dimensional behaviors for continuous
         and binary variables. This function can either take single valued inputs or a range of inputs. The function then looks at the data type of the input and then decides on the course of
@@ -229,7 +231,8 @@ def refute_estimate(self):
 
                 results_matrix = np.random.rand(len(self.kappa_t),len(self.kappa_y)) # Matrix to hold all the results of NxM
                 orig_data = copy.deepcopy(self._data)
-                for i in range(len(self.kappa_t)):
+
+                for i in tqdm(range(len(self.kappa_t)), colour=CausalRefuter.PROGRESS_BAR_COLOR, disable = not show_progress_bar, desc="Refuting Estimates: "):
                     for j in range(len(self.kappa_y)):
                         new_data = self.include_confounders_effect(orig_data, self.kappa_t[i], self.kappa_y[j])
                         new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
@@ -282,7 +285,7 @@ def refute_estimate(self):
                 outcomes = np.random.rand(len(self.kappa_t))
                 orig_data = copy.deepcopy(self._data)
 
-                for i in range(0,len(self.kappa_t)):
+                for i in tqdm(range(0,len(self.kappa_t)), colour=CausalRefuter.PROGRESS_BAR_COLOR, disable = not show_progress_bar, desc="Refuting Estimates: "):
                     new_data = self.include_confounders_effect(orig_data, self.kappa_t[i], self.kappa_y)
                     new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
                     new_effect = new_estimator.estimate_effect()
@@ -316,7 +319,7 @@ def refute_estimate(self):
                 outcomes = np.random.rand(len(self.kappa_y))
                 orig_data = copy.deepcopy(self._data)
 
-                for i in range(0, len(self.kappa_y)):
+                for i in tqdm(range(0,len(self.kappa_y)), colour=CausalRefuter.PROGRESS_BAR_COLOR, disable = not show_progress_bar, desc="Refuting Estimates: "):
                     new_data = self.include_confounders_effect(orig_data, self.kappa_t, self.kappa_y[i])
                     new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate)
                     new_effect = new_estimator.estimate_effect()

diff --git a/dowhy/causal_refuters/data_subset_refuter.py b/dowhy/causal_refuters/data_subset_refuter.py
@@ -2,6 +2,8 @@
 import logging
 from joblib import Parallel, delayed
 
+from tqdm.auto import tqdm
+
 from dowhy.causal_refuter import CausalRefuter, CausalRefutation
 from dowhy.causal_estimator import CausalEstimator
 
@@ -37,7 +39,7 @@ def __init__(self, *args, **kwargs):
 
         self.logger = logging.getLogger(__name__)
 
-    def refute_estimate(self):
+    def refute_estimate(self, show_progress_bar=False):
 
         sample_estimates = np.zeros(self._num_simulations)
         self.logger.info("Refutation over {} simulated datasets of size {} each"
@@ -60,7 +62,7 @@ def refute_once():
         sample_estimates = Parallel(
             n_jobs=self._n_jobs,
             verbose=self._verbose
-        )(delayed(refute_once)() for _ in range(self._num_simulations))
+        )(delayed(refute_once)() for _ in tqdm(range(self._num_simulations), colour=CausalRefuter.PROGRESS_BAR_COLOR, disable = not show_progress_bar, desc="Refuting Estimates: "))
         sample_estimates = np.array(sample_estimates)
 
         refute = CausalRefutation(

diff --git a/dowhy/causal_refuters/dummy_outcome_refuter.py b/dowhy/causal_refuters/dummy_outcome_refuter.py
@@ -4,6 +4,8 @@
 import pandas as pd
 import logging
 import pdb
+
+from tqdm.auto import tqdm
 from collections import OrderedDict, namedtuple
 from dowhy.causal_refuter import CausalRefutation
 from dowhy.causal_refuter import CausalRefuter
@@ -214,7 +216,7 @@ def __init__(self, *args, **kwargs):
         self._outcome_name_str = self._outcome_name[0]
         self.logger = logging.getLogger(__name__)
 
-    def refute_estimate(self):
+    def refute_estimate(self, show_progress_bar=False):
 
         # We need to change the identified estimand
         # We thus, make a copy. This is done as we don't want
@@ -238,7 +240,8 @@ def refute_estimate(self):
         # Train and the Validation Datasets. Thus, we run the simulation loop followed by the training and the validation
         # loops. Thus, we can get different values everytime we get the estimator.
 
-        for _ in range( self._num_simulations ):
+        # for _ in range( self._num_simulations ):
+        for _ in tqdm(range(self._num_simulations), colour=CausalRefuter.PROGRESS_BAR_COLOR, disable = not show_progress_bar, desc="Refuting Estimates: "):
             estimates = []
 
             if estimator_present == False:

diff --git a/dowhy/causal_refuters/placebo_treatment_refuter.py b/dowhy/causal_refuters/placebo_treatment_refuter.py
@@ -5,6 +5,7 @@
 import logging
 from joblib import Parallel, delayed
 
+from tqdm.auto import tqdm
 
 from dowhy.causal_refuter import CausalRefutation
 from dowhy.causal_refuter import CausalRefuter
@@ -52,7 +53,7 @@ def __init__(self, *args, **kwargs):
         self.logger = logging.getLogger(__name__)
 
 
-    def refute_estimate(self):
+    def refute_estimate(self, show_progress_bar=False):
         # only permute is supported for iv methods
         if self._target_estimand.identifier_method.startswith("iv"):
             if self._placebo_type != "permute":
@@ -145,7 +146,9 @@ def refute_once():
         sample_estimates = Parallel(
             n_jobs=self._n_jobs, 
             verbose=self._verbose
-        )(delayed(refute_once)() for _ in range(self._num_simulations))
+        )(delayed(refute_once)() for _ in tqdm(range(self._num_simulations), disable = not show_progress_bar, colour=CausalRefuter.PROGRESS_BAR_COLOR, desc="Refuting Estimates: "))
+
+        # for _ in range(self._num_simulations))
         sample_estimates = np.array(sample_estimates)
 
         # Restoring the value of iv_instrument_name

diff --git a/dowhy/causal_refuters/random_common_cause.py b/dowhy/causal_refuters/random_common_cause.py
@@ -4,6 +4,8 @@
 import logging
 from joblib import Parallel, delayed
 
+from tqdm.auto import tqdm
+
 from dowhy.causal_refuter import CausalRefutation
 from dowhy.causal_refuter import CausalRefuter
 from dowhy.causal_estimator import CausalEstimator
@@ -33,7 +35,7 @@ def __init__(self, *args, **kwargs):
 
         self.logger = logging.getLogger(__name__)
 
-    def refute_estimate(self):
+    def refute_estimate(self, show_progress_bar=False):
         num_rows = self._data.shape[0]
         self.logger.info("Refutation over {} simulated datasets, each with a random common cause added"
                          .format(self._num_simulations))
@@ -58,7 +60,7 @@ def refute_once():
         sample_estimates = Parallel(
             n_jobs=self._n_jobs,
             verbose=self._verbose
-        )(delayed(refute_once)() for _ in range(self._num_simulations))
+        )(delayed(refute_once)() for _ in tqdm(range(self._num_simulations), colour=CausalRefuter.PROGRESS_BAR_COLOR, disable = not show_progress_bar, desc="Refuting Estimates: "))
         sample_estimates = np.array(sample_estimates)
 
         refute = CausalRefutation(