From 09af170cdebc63f095475154d9505272d3e407ba Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 29 Jun 2022 19:27:31 -0600 Subject: [PATCH 01/84] Fix "WARNING: Title underline too short." message in rst files --- sphinx/source/base_classes.rst | 12 ++++++------ sphinx/source/concept.rst | 10 +++++----- sphinx/source/deconvolution.rst | 2 +- sphinx/source/feature_ablation.rst | 2 +- sphinx/source/feature_permutation.rst | 2 +- sphinx/source/guided_backprop.rst | 2 +- sphinx/source/guided_grad_cam.rst | 2 +- sphinx/source/influence.rst | 14 +++++++------- sphinx/source/input_x_gradient.rst | 2 +- sphinx/source/layer.rst | 24 ++++++++++++------------ sphinx/source/metrics.rst | 6 +++--- sphinx/source/neuron.rst | 20 ++++++++++---------- sphinx/source/robust.rst | 10 +++++----- sphinx/source/shapley_value_sampling.rst | 2 +- 14 files changed, 55 insertions(+), 55 deletions(-) diff --git a/sphinx/source/base_classes.rst b/sphinx/source/base_classes.rst index c337d666fc..a1f3d8117b 100644 --- a/sphinx/source/base_classes.rst +++ b/sphinx/source/base_classes.rst @@ -1,32 +1,32 @@ Base Classes -========== +======================== Attribution -^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.Attribution :members: Layer Attribution -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerAttribution :members: Neuron Attribution -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronAttribution :members: Gradient Attribution -^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.GradientAttribution :members: Perturbation Attribution -^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^ .. 
autoclass:: captum.attr.PerturbationAttribution :members: diff --git a/sphinx/source/concept.rst b/sphinx/source/concept.rst index 7aa60aabb9..19157398b7 100644 --- a/sphinx/source/concept.rst +++ b/sphinx/source/concept.rst @@ -1,29 +1,29 @@ Concept-based Interpretability -====== +============================== TCAV -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.concept.TCAV :members: ConceptInterpreter -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.concept.ConceptInterpreter :members: Concept -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.concept.Concept :members: Classifier -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.concept.Classifier :members: diff --git a/sphinx/source/deconvolution.rst b/sphinx/source/deconvolution.rst index 61e092e768..d5813d3842 100644 --- a/sphinx/source/deconvolution.rst +++ b/sphinx/source/deconvolution.rst @@ -1,5 +1,5 @@ Deconvolution -========= +============= .. autoclass:: captum.attr.Deconvolution :members: diff --git a/sphinx/source/feature_ablation.rst b/sphinx/source/feature_ablation.rst index 35484a0fe6..05467941f3 100644 --- a/sphinx/source/feature_ablation.rst +++ b/sphinx/source/feature_ablation.rst @@ -1,5 +1,5 @@ Feature Ablation -========= +================ .. autoclass:: captum.attr.FeatureAblation :members: diff --git a/sphinx/source/feature_permutation.rst b/sphinx/source/feature_permutation.rst index d58f625aee..6387691cd1 100644 --- a/sphinx/source/feature_permutation.rst +++ b/sphinx/source/feature_permutation.rst @@ -1,5 +1,5 @@ Feature Permutation -========= +=================== .. autoclass:: captum.attr.FeaturePermutation :members: diff --git a/sphinx/source/guided_backprop.rst b/sphinx/source/guided_backprop.rst index 6ef3a947ae..4c0685e8c5 100644 --- a/sphinx/source/guided_backprop.rst +++ b/sphinx/source/guided_backprop.rst @@ -1,5 +1,5 @@ Guided Backprop -========= +=============== .. 
autoclass:: captum.attr.GuidedBackprop :members: diff --git a/sphinx/source/guided_grad_cam.rst b/sphinx/source/guided_grad_cam.rst index 99f18d2af1..207d8e55fa 100644 --- a/sphinx/source/guided_grad_cam.rst +++ b/sphinx/source/guided_grad_cam.rst @@ -1,5 +1,5 @@ Guided GradCAM -========= +============== .. autoclass:: captum.attr.GuidedGradCam :members: diff --git a/sphinx/source/influence.rst b/sphinx/source/influence.rst index 6366924a70..6b906d8c47 100644 --- a/sphinx/source/influence.rst +++ b/sphinx/source/influence.rst @@ -1,41 +1,41 @@ Influential Examples -====== +==================== DataInfluence -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.influence.DataInfluence :members: SimilarityInfluence -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.influence.SimilarityInfluence :members: TracInCPBase -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.influence.TracInCPBase :members: TracInCP -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.influence.TracInCP :members: TracInCPFast -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.influence.TracInCPFast :members: TracInCPFastRandProj -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.influence.TracInCPFastRandProj :members: diff --git a/sphinx/source/input_x_gradient.rst b/sphinx/source/input_x_gradient.rst index cd5f222e27..5213eab69b 100644 --- a/sphinx/source/input_x_gradient.rst +++ b/sphinx/source/input_x_gradient.rst @@ -1,5 +1,5 @@ Input X Gradient -=============== +================ .. autoclass:: captum.attr.InputXGradient :members: diff --git a/sphinx/source/layer.rst b/sphinx/source/layer.rst index 7fbbd5bd85..466fbd97d2 100644 --- a/sphinx/source/layer.rst +++ b/sphinx/source/layer.rst @@ -1,70 +1,70 @@ Layer Attribution -====== +=========================== Layer Conductance -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
autoclass:: captum.attr.LayerConductance :members: Layer Activation -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerActivation :members: Internal Influence -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.InternalInfluence :members: Layer Gradient X Activation -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerGradientXActivation :members: GradCAM -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerGradCam :members: Layer DeepLift -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerDeepLift :members: Layer DeepLiftShap -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerDeepLiftShap :members: Layer GradientShap -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerGradientShap :members: Layer Integrated Gradients -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerIntegratedGradients :members: Layer Feature Ablation -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerFeatureAblation :members: Layer LRP -^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.LayerLRP :members: diff --git a/sphinx/source/metrics.rst b/sphinx/source/metrics.rst index 47c11e4856..8e71a40b02 100644 --- a/sphinx/source/metrics.rst +++ b/sphinx/source/metrics.rst @@ -1,15 +1,15 @@ Metrics -====== +=========== Infidelity -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^ .. autoclass:: captum.metrics.infidelity :members: Sensitivity -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^ .. 
autoclass:: captum.metrics.sensitivity_max :members: diff --git a/sphinx/source/neuron.rst b/sphinx/source/neuron.rst index 8ad1514378..6f894df028 100644 --- a/sphinx/source/neuron.rst +++ b/sphinx/source/neuron.rst @@ -1,56 +1,56 @@ Neuron Attribution -======= +=========================== Neuron Gradient -^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronGradient :members: Neuron Integrated Gradients -^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronIntegratedGradients :members: Neuron Conductance -^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronConductance :members: Neuron DeepLift -^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronDeepLift :members: Neuron DeepLiftShap -^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronDeepLiftShap :members: Neuron GradientShap -^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronGradientShap :members: Neuron Guided Backprop -^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronGuidedBackprop :members: Neuron Deconvolution -^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronDeconvolution :members: Neuron Feature Ablation -^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.attr.NeuronFeatureAblation :members: diff --git a/sphinx/source/robust.rst b/sphinx/source/robust.rst index 3b90a32ae5..48b360ad80 100644 --- a/sphinx/source/robust.rst +++ b/sphinx/source/robust.rst @@ -1,29 +1,29 @@ Robustness -====== +====================== FGSM -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.robust.FGSM :members: PGD -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.robust.PGD :members: Attack Comparator -^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^ .. 
autoclass:: captum.robust.AttackComparator :members: Min Param Perturbation -^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^ .. autoclass:: captum.robust.MinParamPerturbation :members: diff --git a/sphinx/source/shapley_value_sampling.rst b/sphinx/source/shapley_value_sampling.rst index c998125af9..667874d805 100644 --- a/sphinx/source/shapley_value_sampling.rst +++ b/sphinx/source/shapley_value_sampling.rst @@ -1,5 +1,5 @@ Shapley Value Sampling -========= +====================== .. autoclass:: captum.attr.ShapleyValueSampling :members: From aa953fd97ce11be6e157f2f6a1adade955f8ce4a Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 30 Jun 2022 13:31:10 -0600 Subject: [PATCH 02/84] Fix Sphinx bullet list spacing warning --- captum/attr/_core/feature_ablation.py | 1 + captum/attr/_core/integrated_gradients.py | 1 + captum/attr/_core/kernel_shap.py | 1 + captum/attr/_core/layer/layer_conductance.py | 1 + captum/attr/_core/layer/layer_deep_lift.py | 1 + captum/attr/_core/lime.py | 1 + captum/attr/_core/neuron/neuron_feature_ablation.py | 2 ++ captum/attr/_core/occlusion.py | 1 + captum/attr/_core/shapley_value.py | 2 ++ 9 files changed, 11 insertions(+) diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index fd0007fc75..a780775c59 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -101,6 +101,7 @@ def attribute( - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. 
Default: None diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index e96a826c32..0421c79339 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -162,6 +162,7 @@ def attribute( # type: ignore - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. diff --git a/captum/attr/_core/kernel_shap.py b/captum/attr/_core/kernel_shap.py index 2826b30dfe..9c537d64b8 100644 --- a/captum/attr/_core/kernel_shap.py +++ b/captum/attr/_core/kernel_shap.py @@ -120,6 +120,7 @@ def attribute( # type: ignore - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index 3d76569c10..8b98ce4e19 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -152,6 +152,7 @@ def attribute( - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 71a8e9eb29..dc036b6491 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -180,6 +180,7 @@ def attribute( - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. 
+ In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index 520251ce53..241013852a 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -913,6 +913,7 @@ def attribute( # type: ignore - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index d706f71cb4..a5186679f6 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -108,6 +108,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) + baselines (scalar, tensor, tuple of scalars or tensors, optional): Baselines define reference value which replaces each feature when ablated. @@ -132,6 +133,7 @@ def attribute( - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index de148693fa..79dfe1a251 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -124,6 +124,7 @@ def attribute( # type: ignore - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. 
Default: None diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index 72af4e7237..622d0469f0 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -131,6 +131,7 @@ def attribute( - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None @@ -583,6 +584,7 @@ def attribute( - or a scalar, corresponding to a tensor in the inputs' tuple. This scalar value is broadcasted for corresponding input tensor. + In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None From 294a01acc8907ab1c6f874bb1ed7057ddcceae3c Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 5 Jul 2022 12:22:04 -0600 Subject: [PATCH 03/84] Fix automodule path --- sphinx/source/common.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sphinx/source/common.rst b/sphinx/source/common.rst index 711a7e6fe5..9fc3682266 100644 --- a/sphinx/source/common.rst +++ b/sphinx/source/common.rst @@ -1,7 +1,7 @@ Captum.Utils ============ -.. automodule:: captum.attr._utils.common +.. automodule:: captum._utils.common .. autofunction:: validate_input .. 
autofunction:: validate_noise_tunnel_type From 9c24eb90c563c69b3600f2185aaae45a3315ca87 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 5 Jul 2022 13:43:26 -0600 Subject: [PATCH 04/84] Fix more Sphinx warnings --- captum/attr/_core/kernel_shap.py | 2 ++ .../_core/layer/layer_integrated_gradients.py | 5 +++++ captum/attr/_core/lrp.py | 5 ++++- captum/attr/_core/neuron/neuron_deep_lift.py | 2 ++ captum/attr/_core/noise_tunnel.py | 1 + captum/attr/_utils/visualization.py | 2 ++ captum/metrics/_core/infidelity.py | 1 + sphinx/source/common.rst | 15 +++++++++------ sphinx/source/gradient_shap.rst | 3 --- 9 files changed, 26 insertions(+), 10 deletions(-) diff --git a/captum/attr/_core/kernel_shap.py b/captum/attr/_core/kernel_shap.py index 9c537d64b8..1705a77770 100644 --- a/captum/attr/_core/kernel_shap.py +++ b/captum/attr/_core/kernel_shap.py @@ -317,7 +317,9 @@ def kernel_shap_perturb_generator( Perturbations are sampled by the following process: - Choose k (number of selected features), based on the distribution p(k) = (M - 1) / (k * (M - k)) + where M is the total number of features in the interpretable space + - Randomly select a binary vector with k ones, each sample is equally likely. This is done by generating a random vector of normal values and thresholding based on the top k elements. diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 2e769a5658..399bc14765 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -261,11 +261,13 @@ def attribute( tensors or any arbitrary python types. These arguments are provided to forward_func in order following the arguments in inputs. + For a tensor, the first dimension of the tensor must correspond to the number of examples. It will be repeated for each of `n_steps` along the integrated path. 
For all other types, the given argument is used for all forward evaluations. + Note that attributions are not computed with respect to these arguments. Default: None @@ -280,6 +282,7 @@ def attribute( which are computed (forward / backward passes) sequentially. internal_batch_size must be at least equal to #examples. + For DataParallel models, each batch is split among the available devices, so evaluations on each available device contain internal_batch_size / num_devices examples. @@ -297,11 +300,13 @@ def attribute( then the attributions will be computed with respect to layer input, otherwise it will be computed with respect to layer output. + Note that currently it is assumed that either the input or the output of internal layer, depending on whether we attribute to the input or output, is a single tensor. Support for multiple tensors will be added later. Default: False + Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - **attributions** (*tensor*, tuple of *tensors* or tuple of *tensors*): diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index e11d0b8544..77b4b825ca 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -106,6 +106,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). @@ -154,7 +155,7 @@ def attribute( Returns: *tensor* or tuple of *tensors* of **attributions** - or 2-element tuple of **attributions**, **delta**:: + or 2-element tuple of **attributions**, **delta**: - **attributions** (*tensor* or tuple of *tensors*): The propagated relevance values with respect to each input feature. 
The values are normalized by the output score @@ -168,10 +169,12 @@ def attribute( corresponding sized tensors is returned. The sum of attributions is one and not corresponding to the prediction score as in other implementations. + - **delta** (*tensor*, returned if return_convergence_delta=True): Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of of examples in the inputs. + Examples:: >>> # ImageClassifier takes a single input tensor of images Nx3x32x32, diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index aff216d37a..6096dd72ca 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -279,6 +279,7 @@ class NeuronDeepLiftShap(NeuronAttribution, GradientAttribution): 1. Assumes that input features are independent of one another 2. Is linear, meaning that the explanations are modeled through the additive composition of feature effects. + Although, it assumes a linear model for each explanation, the overall model across multiple explanations can be complex and non-linear. """ @@ -376,6 +377,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) + baselines (tensor, tuple of tensors, callable): Baselines define reference samples that are compared with the inputs. 
In order to assign attribution scores DeepLift diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 0fbc32115e..cccb3f303c 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -47,6 +47,7 @@ class NoiseTunnel(Attribution): https://arxiv.org/abs/1810.03307 https://arxiv.org/abs/1706.03825 https://arxiv.org/pdf/1806.10758 + This method currently also supports batches of multiple examples input, however it can be computationally expensive depending on the model, the dimensionality of the data and execution environment. diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index 2db9026872..76677a9781 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -132,6 +132,7 @@ def visualize_image_attr( 5. `alpha_scaling` - Sets alpha channel of each pixel to be equal to normalized attribution value. + Default: `heat_map` sign (string, optional): Chosen sign of attributions to visualize. Supported options are: @@ -147,6 +148,7 @@ def visualize_image_attr( values. This is not supported for `masked_image` or `alpha_scaling` modes, since signed information cannot be represented in these modes. + Default: `absolute_value` plt_fig_axis (tuple, optional): Tuple of matplotlib.pyplot.figure and axis on which to visualize. If None is provided, then a new figure diff --git a/captum/metrics/_core/infidelity.py b/captum/metrics/_core/infidelity.py index 33f485a78e..e90e88ce09 100644 --- a/captum/metrics/_core/infidelity.py +++ b/captum/metrics/_core/infidelity.py @@ -205,6 +205,7 @@ def infidelity( Similar to previous case here as well we need to return only perturbed inputs in case `infidelity_perturb_func_decorator` decorates out `perturb_func`. 
+ It is important to note that for performance reasons `perturb_func` isn't called for each example individually but on a batch of input examples that are repeated `max_examples_per_batch / batch_size` diff --git a/sphinx/source/common.rst b/sphinx/source/common.rst index 9fc3682266..7abf6a382a 100644 --- a/sphinx/source/common.rst +++ b/sphinx/source/common.rst @@ -1,12 +1,15 @@ Captum.Utils ============ -.. automodule:: captum._utils.common +.. automodule:: captum.attr._utils.common -.. autofunction:: validate_input -.. autofunction:: validate_noise_tunnel_type -.. autofunction:: format_input -.. autofunction:: _format_attributions -.. autofunction:: zeros +.. autofunction:: _validate_input +.. autofunction:: _validate_noise_tunnel_type .. autofunction:: _reshape_and_sum + +.. currentmodule:: captum._utils.common + +.. autofunction:: _format_inputs +.. autofunction:: _format_output +.. autofunction:: _zeros .. autofunction:: _run_forward diff --git a/sphinx/source/gradient_shap.rst b/sphinx/source/gradient_shap.rst index 2a676dcb06..8d94c31463 100644 --- a/sphinx/source/gradient_shap.rst +++ b/sphinx/source/gradient_shap.rst @@ -3,6 +3,3 @@ GradientShap .. autoclass:: captum.attr.GradientShap :members: - -.. 
autoclass:: captum.attr.InputBaselineXGradient - :members: From a5c5eb7d597a6264646588b2adba8a6b100cf2bb Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 17 Jul 2022 12:40:36 -0600 Subject: [PATCH 05/84] Docstring fix: string -> str --- captum/attr/_core/integrated_gradients.py | 2 +- captum/attr/_core/layer/internal_influence.py | 2 +- captum/attr/_core/layer/layer_conductance.py | 2 +- .../attr/_core/layer/layer_integrated_gradients.py | 2 +- captum/attr/_core/neuron/neuron_conductance.py | 2 +- .../_core/neuron/neuron_integrated_gradients.py | 2 +- captum/attr/_core/noise_tunnel.py | 2 +- captum/attr/_utils/visualization.py | 14 +++++++------- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index 0421c79339..a23b9346ab 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -211,7 +211,7 @@ def attribute( # type: ignore Default: None n_steps (int, optional): The number of steps used by the approximation method. Default: 50. - method (string, optional): Method for approximating the integral, + method (str, optional): Method for approximating the integral, one of `riemann_right`, `riemann_left`, `riemann_middle`, `riemann_trapezoid` or `gausslegendre`. Default: `gausslegendre` if no method is provided. diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index 8976fe7344..67d6f58505 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -159,7 +159,7 @@ def attribute( Default: None n_steps (int, optional): The number of steps used by the approximation method. Default: 50. - method (string, optional): Method for approximating the integral, + method (str, optional): Method for approximating the integral, one of `riemann_right`, `riemann_left`, `riemann_middle`, `riemann_trapezoid` or `gausslegendre`. 
Default: `gausslegendre` if no method is provided. diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index 8b98ce4e19..300b9418f9 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -201,7 +201,7 @@ def attribute( Default: None n_steps (int, optional): The number of steps used by the approximation method. Default: 50. - method (string, optional): Method for approximating the integral, + method (str, optional): Method for approximating the integral, one of `riemann_right`, `riemann_left`, `riemann_middle`, `riemann_trapezoid` or `gausslegendre`. Default: `gausslegendre` if no method is provided. diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 399bc14765..acf8ae0baf 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -273,7 +273,7 @@ def attribute( Default: None n_steps (int, optional): The number of steps used by the approximation method. Default: 50. - method (string, optional): Method for approximating the integral, + method (str, optional): Method for approximating the integral, one of `riemann_right`, `riemann_left`, `riemann_middle`, `riemann_trapezoid` or `gausslegendre`. Default: `gausslegendre` if no method is provided. diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index dec6b39b01..b68cce274a 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -216,7 +216,7 @@ def attribute( Default: None n_steps (int, optional): The number of steps used by the approximation method. Default: 50. 
- method (string, optional): Method for approximating the integral, + method (str, optional): Method for approximating the integral, one of `riemann_right`, `riemann_left`, `riemann_middle`, `riemann_trapezoid` or `gausslegendre`. Default: `gausslegendre` if no method is provided. diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index f67aec7e7e..e97f2fc24f 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -174,7 +174,7 @@ def attribute( Default: None n_steps (int, optional): The number of steps used by the approximation method. Default: 50. - method (string, optional): Method for approximating the integral, + method (str, optional): Method for approximating the integral, one of `riemann_right`, `riemann_left`, `riemann_middle`, `riemann_trapezoid` or `gausslegendre`. Default: `gausslegendre` if no method is provided. diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index cccb3f303c..86d988119c 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -102,7 +102,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - nt_type (string, optional): Smoothing type of the attributions. + nt_type (str, optional): Smoothing type of the attributions. `smoothgrad`, `smoothgrad_sq` or `vargrad` Default: `smoothgrad` if `type` is not provided. 
nt_samples (int, optional): The number of randomly generated examples diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index 76677a9781..3e7ef327eb 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -116,7 +116,7 @@ def visualize_image_attr( This is a necessary argument for any visualization method which utilizes the original image. Default: None - method (string, optional): Chosen method for visualizing attribution. + method (str, optional): Chosen method for visualizing attribution. Supported options are: 1. `heat_map` - Display heat map of chosen attributions @@ -134,7 +134,7 @@ def visualize_image_attr( to be equal to normalized attribution value. Default: `heat_map` - sign (string, optional): Chosen sign of attributions to visualize. Supported + sign (str, optional): Chosen sign of attributions to visualize. Supported options are: 1. `positive` - Displays only positive pixel attributions. @@ -161,7 +161,7 @@ def visualize_image_attr( and scale value are computed using absolute value of attributions. Default: 2 - cmap (string, optional): String corresponding to desired colormap for + cmap (str, optional): String corresponding to desired colormap for heatmap visualization. This defaults to "Reds" for negative sign, "Blues" for absolute value, "Greens" for positive sign, and a spectrum from red to green for all. Note that this @@ -177,7 +177,7 @@ def visualize_image_attr( necessary for appropriate alignment when visualizing multiple plots, some with colorbars and some without. Default: False - title (string, optional): Title string for plot. If None, no title is + title (str, optional): Title string for plot. If None, no title is set. Default: None fig_size (tuple, optional): Size of figure created. @@ -346,13 +346,13 @@ def visualize_image_attr_multiple( with values in range 0-1 or 0-255. This is a necessary argument for any visualization method which utilizes the original image. 
- methods (list of strings): List of strings of length k, defining method + methods (list of str): List of strings of length k, defining method for each visualization. Each method must be a valid string argument for method to visualize_image_attr. - signs (list of strings): List of strings of length k, defining signs for + signs (list of str): List of strings of length k, defining signs for each visualization. Each sign must be a valid string argument for sign to visualize_image_attr. - titles (list of strings, optional): List of strings of length k, providing + titles (list of str, optional): List of strings of length k, providing a title string for each plot. If None is provided, no titles are added to subplots. Default: None From 3c13769b3362f99e01f5385f195529e41fe825fc Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 17 Jul 2022 14:15:13 -0600 Subject: [PATCH 06/84] function -> callable --- captum/attr/_core/gradient_shap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 57d5e909af..506334c77c 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -59,7 +59,7 @@ def __init__(self, forward_func: Callable, multiply_by_inputs: bool = True) -> N r""" Args: - forward_func (function): The forward function of the model or + forward_func (callable): The forward function of the model or any modification of it. multiply_by_inputs (bool, optional): Indicates whether to factor model inputs' multiplier in the final attribution scores. @@ -162,7 +162,7 @@ def attribute( per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. 
- stdevs (float, or a tuple of floats optional): The standard deviation + stdevs (float, or a tuple of floats, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is @@ -294,7 +294,7 @@ def __init__(self, forward_func: Callable, multiply_by_inputs=True) -> None: r""" Args: - forward_func (function): The forward function of the model or + forward_func (callable): The forward function of the model or any modification of it multiply_by_inputs (bool, optional): Indicates whether to factor model inputs' multiplier in the final attribution scores. From 61c2f8854869e767972dc6b929ac44242d791641 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 17 Jul 2022 14:16:07 -0600 Subject: [PATCH 07/84] or a tuple of -> or tuple of --- captum/attr/_core/gradient_shap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 506334c77c..484e2c8b64 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -162,7 +162,7 @@ def attribute( per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or a tuple of floats, optional): The standard deviation + stdevs (float, or tuple of floats, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. 
If it is From 0ace2f23f604568537270c327ca668b0cc0cc0db Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 18 Jul 2022 11:37:35 -0600 Subject: [PATCH 08/84] Fix doc style --- captum/_utils/av.py | 8 ++++---- captum/influence/_core/similarity_influence.py | 4 ++-- captum/influence/_core/tracincp.py | 8 ++++---- captum/influence/_core/tracincp_fast_rand_proj.py | 8 ++++---- captum/influence/_utils/common.py | 2 +- captum/robust/_core/fgsm.py | 2 +- captum/robust/_core/metrics/min_param_perturbation.py | 4 ++-- captum/robust/_core/pgd.py | 2 +- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/captum/_utils/av.py b/captum/_utils/av.py index ac3c32a204..fa594abdaa 100644 --- a/captum/_utils/av.py +++ b/captum/_utils/av.py @@ -211,7 +211,7 @@ def save( AV.generate_dataset_activations from batch index. It assumes identifier is same for all layers if a list of `layers` is provided. - layers (str or List of str): The layer(s) for which the activation vectors + layers (str or list of str): The layer(s) for which the activation vectors are computed. act_tensors (Tensor or List of Tensor): A batch of activation vectors. This must match the dimension of `layers`. @@ -299,7 +299,7 @@ def _manage_loading_layers( for the `layer` are stored. model_id (str): The name/version of the model for which layer activations are being computed and stored. - layers (str or List of str): The layer(s) for which the activation vectors + layers (str or list of str): The layer(s) for which the activation vectors are computed. identifier (str or None): An optional identifier for the layer activations. Can be used to distinguish between activations for @@ -357,7 +357,7 @@ def _compute_and_save_activations( define all of its layers as attributes of the model. model_id (str): The name/version of the model for which layer activations are being computed and stored. 
- layers (str or List of str): The layer(s) for which the activation vectors + layers (str or list of str): The layer(s) for which the activation vectors are computed. inputs (tensor or tuple of tensors): Batch of examples for which influential instances are computed. They are passed to the @@ -433,7 +433,7 @@ def generate_dataset_activations( define all of its layers as attributes of the model. model_id (str): The name/version of the model for which layer activations are being computed and stored. - layers (str or List of str): The layer(s) for which the activation vectors + layers (str or list of str): The layer(s) for which the activation vectors are computed. dataloader (torch.utils.data.DataLoader): DataLoader that yields Dataset for which influential instances are computed. They are passed to diff --git a/captum/influence/_core/similarity_influence.py b/captum/influence/_core/similarity_influence.py index 83cb2966fa..03251de37d 100644 --- a/captum/influence/_core/similarity_influence.py +++ b/captum/influence/_core/similarity_influence.py @@ -82,7 +82,7 @@ def __init__( Args: module (torch.nn.Module): An instance of pytorch model. This model should define all of its layers as attributes of the model. - layers (str or List of str): The fully qualified layer(s) for which the + layers (str or list of str): The fully qualified layer(s) for which the activation vectors are computed. influence_src_dataset (torch.utils.data.Dataset): PyTorch Dataset that is used to create a PyTorch Dataloader to iterate over the dataset and @@ -94,7 +94,7 @@ def __init__( model_id (str): The name/version of the model for which layer activations are being computed. Activations will be stored and loaded under the subdirectory with this name if provided. - similarity_metric (Callable): This is a callable function that computes a + similarity_metric (callable): This is a callable function that computes a similarity metric between two representations. 
For example, the representations pair could be from the training and test sets. diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index d5acc2dfef..6e8c605f8d 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -127,7 +127,7 @@ def __init__( path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. - checkpoints_load_func (Callable, optional): The function to load a saved + checkpoints_load_func (callable, optional): The function to load a saved checkpoint into a model to update its parameters, and get the learning rate if it is saved. By default uses a utility to load a model saved as a state dict. @@ -137,7 +137,7 @@ def __init__( be computed for all layers. Otherwise, they will only be computed for the layers specified in `layers`. Default: None - loss_fn (Callable, optional): The loss function applied to model. + loss_fn (callable, optional): The loss function applied to model. Default: None batch_size (int or None, optional): Batch size of the DataLoader created to iterate through `influence_src_dataset`, if it is a Dataset. @@ -478,7 +478,7 @@ def __init__( path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. - checkpoints_load_func (Callable, optional): The function to load a saved + checkpoints_load_func (callable, optional): The function to load a saved checkpoint into a model to update its parameters, and get the learning rate if it is saved. By default uses a utility to load a model saved as a state dict. @@ -488,7 +488,7 @@ def __init__( be computed for all layers. Otherwise, they will only be computed for the layers specified in `layers`. Default: None - loss_fn (Callable, optional): The loss function applied to model. 
There + loss_fn (callable, optional): The loss function applied to model. There are two options for the return type of `loss_fn`. First, `loss_fn` can be a "per-example" loss function - returns a 1D Tensor of losses for each example in a batch. `nn.BCELoss(reduction="none")` diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index cfbf7b47d4..a83a199da0 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -115,12 +115,12 @@ def __init__( path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. - checkpoints_load_func (Callable, optional): The function to load a saved + checkpoints_load_func (callable, optional): The function to load a saved checkpoint into a model to update its parameters, and get the learning rate if it is saved. By default uses a utility to load a model saved as a state dict. Default: _load_flexible_state_dict - loss_fn (Callable, optional): The loss function applied to model. `loss_fn` + loss_fn (callable, optional): The loss function applied to model. `loss_fn` must be a "reduction" loss function that reduces the per-example losses in a batch, and returns a single scalar Tensor. Furthermore, the reduction must be the *sum* or the *mean* of the per-example @@ -669,12 +669,12 @@ def __init__( path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. - checkpoints_load_func (Callable, optional): The function to load a saved + checkpoints_load_func (callable, optional): The function to load a saved checkpoint into a model to update its parameters, and get the learning rate if it is saved. By default uses a utility to load a model saved as a state dict. 
Default: _load_flexible_state_dict - loss_fn (Callable, optional): The loss function applied to model. `loss_fn` + loss_fn (callable, optional): The loss function applied to model. `loss_fn` must be a "reduction" loss function that reduces the per-example losses in a batch, and returns a single scalar Tensor. Furthermore, the reduction must be the *sum* of the per-example losses. For diff --git a/captum/influence/_utils/common.py b/captum/influence/_utils/common.py index b86ddf9f93..2b28670c82 100644 --- a/captum/influence/_utils/common.py +++ b/captum/influence/_utils/common.py @@ -206,7 +206,7 @@ def _get_k_most_influential_helper( Args: influence_src_dataloader (DataLoader): The DataLoader, representing training data, for which we want to compute proponents / opponents. - influence_batch_fn (Callable): A callable that will be called via + influence_batch_fn (callable): A callable that will be called via `influence_batch_fn(inputs, targets, batch)`, where `batch` is a batch in the `influence_src_dataloader` argument. inputs (Tuple of Any): A batch of examples. Does not represent labels, diff --git a/captum/robust/_core/fgsm.py b/captum/robust/_core/fgsm.py index f717481ccd..5cbf6a0dae 100644 --- a/captum/robust/_core/fgsm.py +++ b/captum/robust/_core/fgsm.py @@ -54,7 +54,7 @@ def __init__( e.g. image pixels must be in the range 0-255 Attributes: - bound (Callable): A function that bounds the input values based on + bound (callable): A function that bounds the input values based on given lower_bound and upper_bound. Can be overwritten for custom use cases if necessary. 
zero_thresh (float): The threshold below which gradient will be treated diff --git a/captum/robust/_core/metrics/min_param_perturbation.py b/captum/robust/_core/metrics/min_param_perturbation.py index 99308727e4..fe7391c3c3 100644 --- a/captum/robust/_core/metrics/min_param_perturbation.py +++ b/captum/robust/_core/metrics/min_param_perturbation.py @@ -67,7 +67,7 @@ def __init__( of pytorch model or any modification of a model's forward function. - attack (Perturbation or Callable): This can either be an instance + attack (Perturbation or callable): This can either be an instance of a Captum Perturbation / Attack or any other perturbation or attack function such as a torchvision transform. @@ -103,7 +103,7 @@ def __init__( applied before or after preproc function. Default: False - correct_fn (Callable, optional): This determines whether the perturbed input + correct_fn (callable, optional): This determines whether the perturbed input leads to a correct or incorrect prediction. By default, this function is set to the standard classification test for correctness (comparing argmax of output with target), which requires model output to diff --git a/captum/robust/_core/pgd.py b/captum/robust/_core/pgd.py index b14239c681..151edf370c 100644 --- a/captum/robust/_core/pgd.py +++ b/captum/robust/_core/pgd.py @@ -55,7 +55,7 @@ def __init__( e.g. image pixels must be in the range 0-255 Attributes: - bound (Callable): A function that bounds the input values based on + bound (callable): A function that bounds the input values based on given lower_bound and upper_bound. Can be overwritten for custom use cases if necessary. 
""" From 152aae8c7e07128c2fffc0736de0372d02ac663e Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 18 Jul 2022 11:43:10 -0600 Subject: [PATCH 09/84] Fix device_ids docstring types --- captum/attr/_core/guided_grad_cam.py | 2 +- captum/attr/_core/layer/grad_cam.py | 2 +- captum/attr/_core/layer/internal_influence.py | 2 +- captum/attr/_core/layer/layer_activation.py | 2 +- captum/attr/_core/layer/layer_conductance.py | 2 +- captum/attr/_core/layer/layer_feature_ablation.py | 2 +- captum/attr/_core/layer/layer_gradient_shap.py | 4 ++-- captum/attr/_core/layer/layer_gradient_x_activation.py | 2 +- captum/attr/_core/layer/layer_integrated_gradients.py | 2 +- captum/attr/_core/neuron/neuron_conductance.py | 2 +- captum/attr/_core/neuron/neuron_feature_ablation.py | 2 +- captum/attr/_core/neuron/neuron_gradient.py | 2 +- captum/attr/_core/neuron/neuron_gradient_shap.py | 2 +- .../attr/_core/neuron/neuron_guided_backprop_deconvnet.py | 4 ++-- captum/attr/_core/neuron/neuron_integrated_gradients.py | 2 +- captum/attr/_utils/attribution.py | 6 +++--- 16 files changed, 20 insertions(+), 20 deletions(-) diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py index f6e29c4b29..1949f68666 100644 --- a/captum/attr/_core/guided_grad_cam.py +++ b/captum/attr/_core/guided_grad_cam.py @@ -58,7 +58,7 @@ def __init__( layer (torch.nn.Module): Layer for which GradCAM attributions are computed. Currently, only layers with a single tensor output are supported. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py index c650409149..0820018a2b 100644 --- a/captum/attr/_core/layer/grad_cam.py +++ b/captum/attr/_core/layer/grad_cam.py @@ -65,7 +65,7 @@ def __init__( Output size of attribute matches this layer's output dimensions, except for dimension 2, which will be 1, since GradCAM sums over channels. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index 67d6f58505..5f9fc0b603 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -54,7 +54,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py index 86c511706b..d16555a37b 100644 --- a/captum/attr/_core/layer/layer_activation.py +++ b/captum/attr/_core/layer/layer_activation.py @@ -36,7 +36,7 @@ def __init__( this layer. If multiple layers are provided, attributions are returned as a list, each element corresponding to the activations of the corresponding layer. 
- device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index 300b9418f9..6662de5858 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -57,7 +57,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py index 75ac885eac..42aad97598 100644 --- a/captum/attr/_core/layer/layer_feature_ablation.py +++ b/captum/attr/_core/layer/layer_feature_ablation.py @@ -50,7 +50,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 9473475cdf..a7e5033779 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -75,7 +75,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -343,7 +343,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index a63a5d7abe..0e7b2f91cb 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -41,7 +41,7 @@ def __init__( this layer. If multiple layers are provided, attributions are returned as a list, each element corresponding to the attributions of the corresponding layer. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index acf8ae0baf..eeb5ffdc1f 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -74,7 +74,7 @@ def __init__( dependence, e.g. if you pass in l2 you cannot pass in l1 or l3. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index b68cce274a..9896b36d9d 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -62,7 +62,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index a5186679f6..f5e79a5c8a 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -44,7 +44,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. 
- device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 5292990bbf..6ce2b2acec 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -44,7 +44,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index 42a543b50d..6eeb90bab9 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -66,7 +66,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the neurons in this layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 7c69aed87a..7b15a5fdea 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -48,7 +48,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -217,7 +217,7 @@ def __init__( in the attribute method. Currently, only layers with a single tensor output are supported. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index e97f2fc24f..664b9d935c 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -44,7 +44,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model. 
This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index f4b6e9d35c..0eddbdf880 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -338,7 +338,7 @@ def __init__( function. layer (torch.nn.Module): Layer for which output attributions are computed. Output size of attribute matches that of layer output. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model, which allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -371,7 +371,7 @@ def __init__( function. layer (torch.nn.Module): Layer for which output attributions are computed. Output size of attribute matches that of layer output. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model, which allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -441,7 +441,7 @@ def __init__( function. layer (torch.nn.Module): Layer for which output attributions are computed. Output size of attribute matches that of layer output. - device_ids (list(int)): Device ID list, necessary only if forward_func + device_ids (list of int): Device ID list, necessary only if forward_func applies a DataParallel model, which allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, From ce913dccf767f0e5bf759a3a8a91106b6e526c1e Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 18 Jul 2022 17:45:32 -0600 Subject: [PATCH 10/84] Any -> any For docstring formatting consistency --- captum/attr/_core/feature_ablation.py | 2 +- captum/attr/_core/feature_permutation.py | 2 +- captum/attr/_core/lime.py | 2 +- captum/attr/_core/noise_tunnel.py | 2 +- captum/attr/_models/base.py | 8 ++++---- captum/attr/_utils/stat.py | 2 +- captum/attr/_utils/visualization.py | 2 +- captum/concept/_core/tcav.py | 4 ++-- captum/influence/_core/influence.py | 4 ++-- captum/influence/_core/tracincp.py | 4 ++-- captum/influence/_core/tracincp_fast_rand_proj.py | 4 ++-- captum/metrics/_core/sensitivity.py | 2 +- captum/robust/_core/metrics/min_param_perturbation.py | 4 ++-- 13 files changed, 21 insertions(+), 21 deletions(-) diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index a780775c59..c3091e6612 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -187,7 +187,7 @@ def attribute( (e.g. time estimation). Otherwise, it will fallback to a simple output of progress. Default: False - **kwargs (Any, optional): Any additional arguments used by child + **kwargs (any, optional): Any additional arguments used by child classes of FeatureAblation (such as Occlusion) to construct ablations. These arguments are ignored when using FeatureAblation directly. diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index 544ff16ac6..8d77aef01d 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -195,7 +195,7 @@ def attribute( # type: ignore (e.g. time estimation). Otherwise, it will fallback to a simple output of progress. 
Default: False - **kwargs (Any, optional): Any additional arguments used by child + **kwargs (any, optional): Any additional arguments used by child classes of FeatureAblation (such as Occlusion) to construct ablations. These arguments are ignored when using FeatureAblation directly. diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index 241013852a..68f4ae386d 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -335,7 +335,7 @@ def attribute( (e.g. time estimation). Otherwise, it will fallback to a simple output of progress. Default: False - **kwargs (Any, optional): Any additional arguments necessary for + **kwargs (any, optional): Any additional arguments necessary for sampling and transformation functions (provided to constructor). Default: None diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 86d988119c..4830b3ad12 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -129,7 +129,7 @@ def attribute( randomly draw baseline samples from the `baselines` distribution provided as an input tensor. Default: False - **kwargs (Any, optional): Contains a list of arguments that are passed + **kwargs (any, optional): Contains a list of arguments that are passed to `attribution_method` attribution algorithm. Any additional arguments that should be used for the chosen attribution method should be included here. diff --git a/captum/attr/_models/base.py b/captum/attr/_models/base.py index d57646c0da..2c24918567 100644 --- a/captum/attr/_models/base.py +++ b/captum/attr/_models/base.py @@ -35,7 +35,7 @@ def forward(self, *inputs, **kwargs): Args: - *inputs (Any, optional): A sequence of inputs arguments that the + *inputs (any, optional): A sequence of inputs arguments that the forward function takes. Since forward functions can take any type and number of arguments, this will ensure that we can execute the forward pass using interpretable embedding layer. 
@@ -43,7 +43,7 @@ def forward(self, *inputs, **kwargs): argument is the embedding tensor generated using the `self.embedding` layer using all input arguments provided in `inputs` and `kwargs`. - **kwargs (Any, optional): Similar to `inputs` we want to make sure + **kwargs (any, optional): Similar to `inputs` we want to make sure that our forward pass supports arbitrary number and type of key-value arguments. If `inputs` is not provided, `kwargs` must be provided and the first argument corresponds to the embedding @@ -76,10 +76,10 @@ def indices_to_embeddings(self, *input, **kwargs): Args: - *input (Any, Optional): This can be a tensor(s) of input indices or any + *input (any, Optional): This can be a tensor(s) of input indices or any other variable necessary to comput the embeddings. A typical example of input indices are word or token indices. - **kwargs (Any, optional): Similar to `input` this can be any sequence + **kwargs (any, optional): Similar to `input` this can be any sequence of key-value arguments necessary to compute final embedding tensor. Returns: diff --git a/captum/attr/_utils/stat.py b/captum/attr/_utils/stat.py index 803bbc7ab7..8c643f369b 100644 --- a/captum/attr/_utils/stat.py +++ b/captum/attr/_utils/stat.py @@ -26,7 +26,7 @@ def __init__(self, name: Optional[str] = None, **kwargs: Any) -> None: name (str, optional): The name of the statistic. If not provided, the class name will be used alongside it's parameters - kwargs (Any): + kwargs (any): Additional arguments used to construct the statistic """ self.params = kwargs diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index 3e7ef327eb..3111771ce3 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -363,7 +363,7 @@ def visualize_image_attr_multiple( uses Matplotlib object oriented API and simply returns a figure object without showing. Default: True. 
- **kwargs (Any, optional): Any additional arguments which will be passed + **kwargs (any, optional): Any additional arguments which will be passed to every individual visualization. Such arguments include `show_colorbar`, `alpha_overlay`, `cmap`, etc. diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index 8b6c996856..d347ecfead 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -603,7 +603,7 @@ def interpret( #output_dims - 1 elements. Each tuple is applied as the target for the corresponding example. - additional_forward_args (Any, optional): Extra arguments that are passed to + additional_forward_args (any, optional): Extra arguments that are passed to model when computing the attributions for `inputs` w.r.t. layer output. Default: None @@ -613,7 +613,7 @@ def interpret( `processes`. Otherwise, CAV computations will be performed sequential. Default:None - **kwargs (Any, optional): A list of arguments that are passed to layer + **kwargs (any, optional): A list of arguments that are passed to layer attribution algorithm's attribute method. This could be for example `n_steps` in case of integrated gradients. Default: None diff --git a/captum/influence/_core/influence.py b/captum/influence/_core/influence.py index f8ef1eb882..b8e5eae357 100644 --- a/captum/influence/_core/influence.py +++ b/captum/influence/_core/influence.py @@ -32,7 +32,7 @@ def __init_( def influence(self, inputs: Any = None, **kwargs: Any) -> Any: r""" Args: - inputs (Any): Batch of examples for which influential + inputs (any): Batch of examples for which influential instances are computed. They are passed to the forward_func. If `inputs` if a tensor or tuple of tensors, the first dimension of a tensor corresponds to the batch dimension. @@ -40,7 +40,7 @@ def influence(self, inputs: Any = None, **kwargs: Any) -> Any: implementation of `DataInfluence` abstract class. 
Returns: - influences (Any): We do not add restrictions on the return type for now, + influences (any): We do not add restrictions on the return type for now, though this may change in the future. """ pass diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 6e8c605f8d..785e34931f 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -325,7 +325,7 @@ def influence( # type: ignore[override] opponent) on the test example. Args: - inputs (Any, optional): If not provided or `None`, the self influence mode + inputs (any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential mode. If the argument `unpack_inputs` is False, the @@ -645,7 +645,7 @@ def influence( # type: ignore[override] opponent) on the test example. Args: - inputs (Any, optional): If not provided or `None`, the self influence mode + inputs (any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential mode. If the argument `unpack_inputs` is False, the diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index a83a199da0..c5f684ab2f 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -224,7 +224,7 @@ def influence( # type: ignore[override] opponent) on the test example. Args: - inputs (Any, optional): If not provided or `None`, the self influence mode + inputs (any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential mode. 
If the argument `unpack_inputs` is False, the @@ -923,7 +923,7 @@ def influence( # type: ignore[override] gradients in the last fully-connected layer, please use `TracInCPFast` instead. Args: - inputs (Any, optional): If not provided or `None`, the self influence mode + inputs (any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential mode. If the argument `unpack_inputs` is False, the diff --git a/captum/metrics/_core/sensitivity.py b/captum/metrics/_core/sensitivity.py index 77d87e6291..b03029cf10 100644 --- a/captum/metrics/_core/sensitivity.py +++ b/captum/metrics/_core/sensitivity.py @@ -166,7 +166,7 @@ def sensitivity_max( `input batch size * n_perturb_samples`. Default: None - **kwargs (Any, optional): Contains a list of arguments that are passed + **kwargs (any, optional): Contains a list of arguments that are passed to `explanation_func` explanation function which in some cases could be the `attribute` function of an attribution algorithm. Any additional arguments that need be passed to the explanation diff --git a/captum/robust/_core/metrics/min_param_perturbation.py b/captum/robust/_core/metrics/min_param_perturbation.py index fe7391c3c3..055500fe51 100644 --- a/captum/robust/_core/metrics/min_param_perturbation.py +++ b/captum/robust/_core/metrics/min_param_perturbation.py @@ -355,7 +355,7 @@ def evaluate( Args: - inputs (Any): Input for which minimal perturbation + inputs (any): Input for which minimal perturbation is computed. It can be provided as a tensor, tuple of tensors, or any raw input type (e.g. PIL image or text string). 
This input is provided directly as input to preproc function @@ -402,7 +402,7 @@ def evaluate( Tuple of (perturbed_inputs, param_val) if successful else Tuple of (None, None) - - **perturbed inputs** (Any): + - **perturbed inputs** (any): Perturbed input (output of attack) which results in incorrect prediction. - param_val (int, float) From f4ae3d4aa52dcf65f7367d7e29a43d97516e0284 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 19 Jul 2022 12:35:39 -0600 Subject: [PATCH 11/84] tuple of tuples -> tuple of tuple --- captum/attr/_core/occlusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index 79dfe1a251..f54d4079af 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -71,7 +71,7 @@ def attribute( # type: ignore to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - sliding_window_shapes (tuple or tuple of tuples): Shape of patch + sliding_window_shapes (tuple or tuple of tuple): Shape of patch (hyperrectangle) to occlude each input. For a single input tensor, this must be a tuple of length equal to the number of dimensions of the input tensor - 1, defining @@ -80,7 +80,7 @@ def attribute( # type: ignore this must be a tuple containing one tuple for each input tensor defining the dimensions of the patch for that input tensor, as described for the single tensor case. - strides (int or tuple or tuple of ints or tuple of tuples, optional): + strides (int or tuple or tuple of ints or tuple of tuple, optional): This defines the step by which the occlusion hyperrectangle should be shifted by in each direction for each iteration. 
For a single tensor input, this can be either a single From 4732c6a9f408804d6a95aea45b267ef36217ed10 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 19 Jul 2022 14:17:40 -0600 Subject: [PATCH 12/84] Fix docstring Sphinx warnings --- captum/influence/_core/tracincp_fast_rand_proj.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index c5f684ab2f..33f637b813 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -255,7 +255,7 @@ def influence( # type: ignore[override] show_progress (bool, optional): For all modes, computation of results requires "training dataset computations": computations for each batch in the training dataset `influence_src_dataset`, which may - take a long time. If `show_progress`is true, the progress of + take a long time. If `show_progress` is true, the progress of "training dataset computations" will be displayed. In particular, the number of batches for which computations have been performed will be displayed. It will try to use tqdm if available for @@ -271,7 +271,7 @@ def influence( # type: ignore[override] `influence_src_dataset`. The length of this tensor is the number of examples in `influence_src_dataset`, regardless of whether it is a Dataset or DataLoader. - - influence score mode: if this mode is run (`inputs is not None, `k` is + - influence score mode: if this mode is run (`inputs` is not None, `k` is None), returns a 2D tensor `influence_scores` of shape `(input_size, influence_src_dataset_size)`, where `input_size` is the number of examples in the test batch, and @@ -955,7 +955,7 @@ def influence( # type: ignore[override] The return value of this method depends on which mode is run. 
- - influence score mode: if this mode is run (`inputs is not None, `k` is + - influence score mode: if this mode is run (`inputs` is not None, `k` is None), returns a 2D tensor `influence_scores` of shape `(input_size, influence_src_dataset_size)`, where `input_size` is the number of examples in the test batch, and From 2ffbbd4df3aa388853adf644e605406b8cc399e4 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 19 Jul 2022 14:24:02 -0600 Subject: [PATCH 13/84] dictionary -> dict --- captum/robust/_core/metrics/min_param_perturbation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/captum/robust/_core/metrics/min_param_perturbation.py b/captum/robust/_core/metrics/min_param_perturbation.py index 055500fe51..0bb6dc1a43 100644 --- a/captum/robust/_core/metrics/min_param_perturbation.py +++ b/captum/robust/_core/metrics/min_param_perturbation.py @@ -392,9 +392,9 @@ def evaluate( (or inputs itself if no preproc_fn is provided) must be a tensor or tuple of tensors. Default: 1 - attack_kwargs (dictionary, optional): Optional dictionary of keyword + attack_kwargs (dict, optional): Optional dictionary of keyword arguments provided to attack function - correct_fn_kwargs (dictionary, optional): Optional dictionary of keyword + correct_fn_kwargs (dict, optional): Optional dictionary of keyword arguments provided to correct function Returns: From d4bb345b68f38afe7c438cefdd6c73102c0e926a Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 20 Jul 2022 12:02:42 -0600 Subject: [PATCH 14/84] list(torch.nn.Module) -> list of torch.nn.Module --- captum/attr/_core/layer/layer_activation.py | 2 +- captum/attr/_core/layer/layer_gradient_x_activation.py | 2 +- captum/attr/_core/layer/layer_lrp.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py index d16555a37b..7dc7f64dfe 100644 --- a/captum/attr/_core/layer/layer_activation.py +++ 
b/captum/attr/_core/layer/layer_activation.py @@ -27,7 +27,7 @@ def __init__( forward_func (callable): The forward function of the model or any modification of it - layer (torch.nn.Module or list(torch.nn.Module)): Layer or layers + layer (torch.nn.Module or list of torch.nn.Module): Layer or layers for which attributions are computed. Output size of attribute matches this layer's input or output dimensions, depending on whether we attribute to diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index 0e7b2f91cb..cba9f9558f 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -32,7 +32,7 @@ def __init__( forward_func (callable): The forward function of the model or any modification of it - layer (torch.nn.Module or list(torch.nn.Module)): Layer or layers + layer (torch.nn.Module or list of torch.nn.Module): Layer or layers for which attributions are computed. Output size of attribute matches this layer's input or output dimensions, depending on whether we attribute to diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index e72bbbaddc..c8c6fc529c 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -51,7 +51,7 @@ def __init__(self, model: Module, layer: ModuleOrModuleList) -> None: PyTorch API starting from PyTorch v1.9. - layer (torch.nn.Module or list(torch.nn.Module)): Layer or layers + layer (torch.nn.Module or list of torch.nn.Module): Layer or layers for which attributions are computed. 
The size and dimensionality of the attributions corresponds to the size and dimensionality of the layer's From f2ad85b63bd047e33890e0be3889937642878437 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 20 Jul 2022 17:53:19 -0600 Subject: [PATCH 15/84] Minor docstring improvements --- captum/attr/_core/layer/layer_integrated_gradients.py | 7 +++---- captum/attr/_core/lrp.py | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index eeb5ffdc1f..38822c67d0 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -55,10 +55,9 @@ def __init__( Args: forward_func (callable): The forward function of the model or any modification of it - layer (ModuleOrModuleList): - Layer or list of layers for which attributions are computed. - For each layer the output size of the attribute matches - this layer's input or output dimensions, depending on + layer (ModuleOrModuleList): Layer or list of layers for which attributions + are computed. For each layer the output size of the attribute + matches this layer's input or output dimensions, depending on whether we attribute to the inputs or outputs of the layer, corresponding to the attribution of each neuron in the input or output of this layer. diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index 77b4b825ca..32f2510ec2 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -108,11 +108,11 @@ def attribute( are provided, the examples must be aligned appropriately. target (int, tuple, tensor or list, optional): Output indices for - which gradients are computed (for classification cases, - this is usually the target class). - If the network returns a scalar value per example, - no target index is necessary. 
- For general 2D outputs, targets can be either: + which gradients are computed (for classification cases, + this is usually the target class). + If the network returns a scalar value per example, + no target index is necessary. + For general 2D outputs, targets can be either: - a single integer or a tensor containing a single integer, which is applied to all input examples From eb756ab36c8005008a7f64c68452193cbb9eb451 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 20 Jul 2022 19:11:40 -0600 Subject: [PATCH 16/84] Resolve some more Sphinx warnings --- captum/attr/_core/deep_lift.py | 2 ++ captum/attr/_core/layer/internal_influence.py | 2 +- captum/attr/_core/layer/layer_deep_lift.py | 2 ++ .../_core/layer/layer_integrated_gradients.py | 2 +- captum/attr/_core/layer/layer_lrp.py | 1 + captum/attr/_core/lrp.py | 1 + captum/concept/_core/tcav.py | 16 ++++++++-- captum/influence/_core/tracincp.py | 29 ++++++++++++------- .../_core/tracincp_fast_rand_proj.py | 26 ++++++++++++----- 9 files changed, 59 insertions(+), 22 deletions(-) diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 251e68dc23..86549c4197 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -614,9 +614,11 @@ class DeepLiftShap(DeepLift): http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf Note that the explanation model: + 1. Assumes that input features are independent of one another 2. Is linear, meaning that the explanations are modeled through the additive composition of feature effects. + Although, it assumes a linear model for each explanation, the overall model across multiple explanations can be complex and non-linear. 
""" diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index 5f9fc0b603..3ecf1cbac1 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -86,7 +86,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalars or tensors, optional): Baselines define a starting point from which integral is computed and can be provided as: diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index dc036b6491..22dea7861f 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -385,9 +385,11 @@ class LayerDeepLiftShap(LayerDeepLift, DeepLiftShap): http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf Note that the explanation model: + 1. Assumes that input features are independent of one another 2. Is linear, meaning that the explanations are modeled through the additive composition of feature effects. + Although, it assumes a linear model for each explanation, the overall model across multiple explanations can be complex and non-linear. 
""" diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 38822c67d0..b7244eae1f 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -41,7 +41,6 @@ class LayerIntegratedGradients(LayerAttribution, GradientAttribution): More details regarding the integrated gradients method can be found in the original paper: https://arxiv.org/abs/1703.01365 - """ def __init__( @@ -53,6 +52,7 @@ def __init__( ) -> None: r""" Args: + forward_func (callable): The forward function of the model or any modification of it layer (ModuleOrModuleList): Layer or list of layers for which attributions diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index c8c6fc529c..2ed7218083 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -198,6 +198,7 @@ def attribute( If attributions for all layers are returned (layer=None) a list of tensors is returned with entries for each layer. + Examples:: >>> # ImageClassifier takes a single input tensor of images Nx3x32x32, diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index 32f2510ec2..7c51046df2 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -98,6 +98,7 @@ def attribute( ]: r""" Args: + inputs (tensor or tuple of tensors): Input for which relevance is propagated. If forward_func takes a single tensor as input, a single input tensor should be provided. 
diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index d347ecfead..dfe7e1374d 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -37,11 +37,13 @@ def __init__(self, datasets: List[AV.AVDataset], labels: List[int]) -> None: However, __get_item__ not only returns a batch of activation vectors, but also a batch of labels indicating which concept that batch of activation vectors is associated with. + Args: + datasets (list[Dataset]): The k-th element of datasets is a Dataset representing activation vectors associated with the k-th concept - labels (list[Int]): The k-th element of labels is the integer label + labels (list[int]): The k-th element of labels is the integer label associated with the k-th concept """ assert len(datasets) == len( @@ -75,10 +77,11 @@ def __getitem__(self, i): indicating which concept the batch of activation vectors is associated with. - args: + Args: + i (int): which (activation vector, label) batch in the dataset to return - returns: + Returns: inputs (Tensor): i-th batch in Dataset (representing activation vectors) labels (Tensor): labels of i-th batch in Dataset @@ -113,6 +116,7 @@ def train_cav( Please see the TCAV class documentation for further information. Args: + model_id (str): A unique identifier for the PyTorch model for which we would like to load the layer activations and train a model in order to compute CAVs. @@ -251,6 +255,7 @@ def __init__( ) -> None: r""" Args: + model (Module): An instance of pytorch model that is used to compute layer activations and attributions. layers (str, list[str]): A list of layer name(s) that are @@ -403,6 +408,7 @@ def load_cavs( of concepts and layer. Args: + concepts (list[Concept]): A list of Concept objects for which we want to load the CAV. @@ -458,6 +464,7 @@ def compute_cavs( the argument. Args: + experimental_sets (list[list[Concept]]): A list of lists of concept instances for which the cavs will be computed. 
force_train (bool, optional): A flag that indicates whether to @@ -469,6 +476,7 @@ def compute_cavs( multi-processing, otherwise it will be performed sequentially in a single process. Default: None + Returns: cavs (dict) : A mapping of concept ids and layers to CAV objects. If CAVs for the concept_ids-layer pairs are present in the @@ -569,6 +577,7 @@ def interpret( scores for specific predictions and CAV vectors. Args: + inputs (tensor or tuple of tensors): Inputs for which predictions are performed and attributions are computed. If model takes a single tensor as @@ -617,6 +626,7 @@ def interpret( attribution algorithm's attribute method. This could be for example `n_steps` in case of integrated gradients. Default: None + Returns: results (dict): A dictionary of sign and magnitude -based tcav scores for each concept set per layer. diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 785e34931f..4d9d8e104c 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -103,6 +103,7 @@ def __init__( ) -> None: r""" Args: + model (torch.nn.Module): An instance of pytorch model. This model should define all of its layers as attributes of the model. influence_src_dataset (torch.utils.data.Dataset or torch.utils.DataLoader): @@ -123,7 +124,7 @@ def __init__( `influence_src_dataset` is a Dataset, `batch_size` should be large. If `influence_src_dataset` was already a DataLoader to begin with, it should have been constructed to have a large batch size. - checkpoints (str or List of str or Iterator): Either the directory of the + checkpoints (str or list of str or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -132,7 +133,7 @@ def __init__( learning rate if it is saved. By default uses a utility to load a model saved as a state dict. 
Default: _load_flexible_state_dict - layers (List of str or None, optional): A list of layer names for which + layers (list of str or None, optional): A list of layer names for which gradients should be computed. If `layers` is None, gradients will be computed for all layers. Otherwise, they will only be computed for the layers specified in `layers`. @@ -217,7 +218,8 @@ def _get_k_most_influential( ) -> KMostInfluentialResults: r""" Args: - inputs (Tuple of Any): A tuple that represents a batch of examples. It does + + inputs (tuple of Any): A tuple that represents a batch of examples. It does not represent labels, which are passed as `targets`. targets (tensor, optional): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. @@ -265,7 +267,8 @@ def _influence( ) -> Tensor: r""" Args: - inputs (Tuple of Any): A batch of examples. Does not represent labels, + + inputs (tuple of Any): A batch of examples. Does not represent labels, which are passed as `targets`. The assumption is that `self.model(*inputs)` produces the predictions for the batch. targets (tensor, optional): If computing influence scores on a loss @@ -325,6 +328,7 @@ def influence( # type: ignore[override] opponent) on the test example. Args: + inputs (any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential @@ -454,6 +458,7 @@ def __init__( ) -> None: r""" Args: + model (torch.nn.Module): An instance of pytorch model. This model should define all of its layers as attributes of the model. influence_src_dataset (torch.utils.data.Dataset or torch.utils.DataLoader): @@ -474,7 +479,7 @@ def __init__( `influence_src_dataset` is a Dataset, `batch_size` should be large. If `influence_src_dataset` was already a DataLoader to begin with, it should have been constructed to have a large batch size. 
- checkpoints (str or List of str or Iterator): Either the directory of the + checkpoints (str or list of str or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -483,7 +488,7 @@ def __init__( learning rate if it is saved. By default uses a utility to load a model saved as a state dict. Default: _load_flexible_state_dict - layers (List of str or None, optional): A list of layer names for which + layers (list of str or None, optional): A list of layer names for which gradients should be computed. If `layers` is None, gradients will be computed for all layers. Otherwise, they will only be computed for the layers specified in `layers`. @@ -645,6 +650,7 @@ def influence( # type: ignore[override] opponent) on the test example. Args: + inputs (any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential @@ -776,7 +782,8 @@ def _influence( output of `self._basic_computation_tracincp`. Args: - inputs (Tuple of Any): A test batch of examples. Does not represent labels, + + inputs (tuple of Any): A test batch of examples. Does not represent labels, which are passed as `targets`. The assumption is that `self.model(*inputs)` produces the predictions for the batch. targets (tensor, optional): If computing influence scores on a loss @@ -831,9 +838,10 @@ def _get_k_most_influential( ) -> KMostInfluentialResults: r""" Args: - inputs (Tuple of Any): A tuple that represents a batch of examples. It does + + inputs (tuple of Any): A tuple that represents a batch of examples. It does not represent labels, which are passed as `targets`. 
- targets (Tensor, optional): If computing influence scores on a loss + targets (tensor, optional): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None k (int, optional): The number of proponents or opponents to return per test @@ -984,7 +992,8 @@ def _basic_computation_tracincp( and batches. Args: - inputs (Tuple of Any): A batch of examples, which could be a training batch + + inputs (tuple of Any): A batch of examples, which could be a training batch or test batch, depending which method is the caller. Does not represent labels, which are passed as `targets`. The assumption is that `self.model(*inputs)` produces the predictions for the batch. diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index 33f637b813..3bf6caad04 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -86,6 +86,7 @@ def __init__( ) -> None: r""" Args: + model (torch.nn.Module): An instance of pytorch model. This model should define all of its layers as attributes of the model. final_fc_layer (torch.nn.Module or str): The last fully connected layer in @@ -111,7 +112,7 @@ def __init__( `influence_src_dataset` is a Dataset, `batch_size` should be large. If `influence_src_dataset` was already a DataLoader to begin with, it should have been constructed to have a large batch size. - checkpoints (str or List of str or Iterator): Either the directory of the + checkpoints (str or list of str or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -224,6 +225,7 @@ def influence( # type: ignore[override] opponent) on the test example. Args: + inputs (any, optional): If not provided or `None`, the self influence mode will be run. 
Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential @@ -358,7 +360,8 @@ def _influence( # type: ignore[override] output of `_basic_computation_tracincp_fast`. Args: - inputs (Tuple of Any): A batch of examples. Does not represent labels, + + inputs (tuple of Any): A batch of examples. Does not represent labels, which are passed as `targets`. The assumption is that `self.model(*inputs)` produces the predictions for the batch. targets (tensor): The labels corresponding to the batch `inputs`. This @@ -415,7 +418,8 @@ def _get_k_most_influential( # type: ignore[override] ) -> KMostInfluentialResults: r""" Args: - inputs (Tuple of Any): A tuple that represents a batch of examples. It does + + inputs (tuple of Any): A tuple that represents a batch of examples. It does not represent labels, which are passed as `targets`. targets (tensor): The labels corresponding to the batch `inputs`. This method is designed to be applied for a loss function, so labels @@ -555,13 +559,14 @@ def _basic_computation_tracincp_fast( and batches. Args: + influence_instance (TracInCPFast): A instance of TracInCPFast or its children. We assume `influence_instance` has a `loss_fn` attribute, i.e. the loss function applied to the output of the last fully-connected layer, as well as a `reduction_type` attribute, which indicates whether `loss_fn` reduces the per-example losses by using their mean or sum. The `reduction_type` attribute must either be "mean" or "sum". - inputs (Tuple of Any): A batch of examples, which could be a training batch + inputs (tuple of Any): A batch of examples, which could be a training batch or test batch, depending which method is the caller. Does not represent labels, which are passed as `targets`. The assumption is that `self.model(*inputs)` produces the predictions for the batch. 
@@ -632,7 +637,7 @@ def __init__( to obtain proponents / opponents or influence scores will be made in an "interactive" manner, and there is sufficient memory to store vectors for the entire `influence_src_dataset`. This is because in order to enable interactive - analysis, this implementation incures overhead in ``__init__` to setup the + analysis, this implementation incures overhead in `__init__` to setup the nearest-neighbors data structure, which is both time and memory intensive, as vectors corresponding to all training examples needed to be stored. To reduce memory usage, this implementation enables random projections of those vectors. @@ -640,6 +645,7 @@ def __init__( accurate, though correct in expectation. Args: + model (torch.nn.Module): An instance of pytorch model. This model should define all of its layers as attributes of the model. final_fc_layer (torch.nn.Module or str): The last fully connected layer in @@ -665,7 +671,7 @@ def __init__( `influence_src_dataset` is a Dataset, `batch_size` should be large. If `influence_src_dataset` was already a DataLoader to begin with, it should have been constructed to have a large batch size. - checkpoints (str or List of str or Iterator): Either the directory of the + checkpoints (str or list of str or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -776,6 +782,7 @@ def _influence( # type: ignore[override] ) -> Tensor: r""" Args: + inputs (tuple of Any): A batch of examples. Does not represent labels, which are passed as `targets`. The assumption is that `self.model(*inputs)` produces the predictions for the batch. @@ -813,7 +820,8 @@ def _get_k_most_influential( # type: ignore[override] ) -> KMostInfluentialResults: r""" Args: - inputs (Tuple of Any): A tuple that represents a batch of examples. 
It does + + inputs (tuple of Any): A tuple that represents a batch of examples. It does not represent labels, which are passed as `targets`. targets (tensor): The labels corresponding to the batch `inputs`. This method is designed to be applied for a loss function, so labels @@ -923,6 +931,7 @@ def influence( # type: ignore[override] gradients in the last fully-connected layer, please use `TracInCPFast` instead. Args: + inputs (any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential @@ -1011,6 +1020,7 @@ def _set_projections_tracincp_fast_rand_proj( `TracInCPFastRandProj.__init__`. Args: + dataloader (DataLoader): determining the projection requires knowing the dimensionality of the last layer's parameters (`jacobian_dim` below) and its input (`layer_input_dim` below). These are @@ -1094,6 +1104,7 @@ def _process_src_intermediate_quantities_tracincp_fast_rand_proj( method creates that data structure. This method has side effects. Args: + src_intermediate_quantities (tensor): the output of the `_get_intermediate_quantities_tracin_fast_rand_proj` function when applied to training dataset `influence_src_dataset`. This @@ -1118,6 +1129,7 @@ def _get_intermediate_quantities_tracincp_fast_rand_proj( specifically, largest dot-product) data structure. Args: + dataloader (DataLoader): DataLoader for which the intermediate quantities are computed. 
projection_quantities (tuple or None): Is either the two tensors defining From 46e4dd1aa6825887a5df3d40dbe0855490883e28 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 20 Jul 2022 19:21:18 -0600 Subject: [PATCH 17/84] Improve doc formatting --- captum/_utils/av.py | 2 +- captum/attr/_core/deep_lift.py | 12 ++++++------ captum/attr/_core/feature_ablation.py | 4 ++-- captum/attr/_core/feature_permutation.py | 4 ++-- captum/attr/_core/gradient_shap.py | 6 +++--- captum/attr/_core/guided_backprop_deconvnet.py | 14 +++++++------- captum/attr/_core/guided_grad_cam.py | 6 +++--- captum/attr/_core/input_x_gradient.py | 6 +++--- captum/attr/_core/integrated_gradients.py | 6 +++--- captum/attr/_core/kernel_shap.py | 6 +++--- captum/attr/_core/layer/grad_cam.py | 6 +++--- captum/attr/_core/layer/internal_influence.py | 6 +++--- captum/attr/_core/layer/layer_activation.py | 4 ++-- captum/attr/_core/layer/layer_conductance.py | 6 +++--- captum/attr/_core/layer/layer_deep_lift.py | 12 ++++++------ .../attr/_core/layer/layer_feature_ablation.py | 6 +++--- captum/attr/_core/layer/layer_gradient_shap.py | 10 +++++----- .../_core/layer/layer_gradient_x_activation.py | 6 +++--- .../_core/layer/layer_integrated_gradients.py | 6 +++--- captum/attr/_core/layer/layer_lrp.py | 4 ++-- captum/attr/_core/lime.py | 18 +++++++++--------- captum/attr/_core/lrp.py | 4 ++-- captum/attr/_core/neuron/neuron_conductance.py | 6 +++--- captum/attr/_core/neuron/neuron_deep_lift.py | 8 ++++---- .../_core/neuron/neuron_feature_ablation.py | 4 ++-- captum/attr/_core/neuron/neuron_gradient.py | 4 ++-- .../attr/_core/neuron/neuron_gradient_shap.py | 6 +++--- .../neuron/neuron_guided_backprop_deconvnet.py | 8 ++++---- .../neuron/neuron_integrated_gradients.py | 4 ++-- captum/attr/_core/noise_tunnel.py | 6 +++--- captum/attr/_core/occlusion.py | 4 ++-- captum/attr/_core/saliency.py | 4 ++-- captum/attr/_core/shapley_value.py | 10 +++++----- captum/attr/_utils/attribution.py | 16 ++++++++-------- 
captum/attr/_utils/visualization.py | 6 +++--- captum/concept/_core/cav.py | 2 +- captum/concept/_core/tcav.py | 2 +- captum/influence/_core/similarity_influence.py | 2 +- captum/metrics/_core/infidelity.py | 2 +- captum/metrics/_core/sensitivity.py | 2 +- 40 files changed, 125 insertions(+), 125 deletions(-) diff --git a/captum/_utils/av.py b/captum/_utils/av.py index fa594abdaa..5250af7ea4 100644 --- a/captum/_utils/av.py +++ b/captum/_utils/av.py @@ -368,7 +368,7 @@ def _compute_and_save_activations( different training batches. num_id (str): An required string representing the batch number for which the activation vectors are computed - additional_forward_args (optional): Additional arguments that will be + additional_forward_args (optional): Additional arguments that will be passed to `model` after inputs. Default: None load_from_disk (bool): Forces function to regenerate activations if False. diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 86549c4197..15b1b65362 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -112,7 +112,7 @@ def __init__( r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place nonlinear submodules; these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. @@ -185,7 +185,7 @@ def attribute( # type: ignore r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -226,7 +226,7 @@ def attribute( # type: ignore use zero scalar corresponding to each input tensor. 
Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -627,7 +627,7 @@ def __init__(self, model: Module, multiply_by_inputs: bool = True) -> None: r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place nonlinear submodules; these are not supported by the register_full_backward_hook PyTorch API. multiply_by_inputs (bool, optional): Indicates whether to factor @@ -696,7 +696,7 @@ def attribute( # type: ignore r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -730,7 +730,7 @@ def attribute( # type: ignore It is recommended that the number of samples in the baselines' tensors is larger than one. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index c3091e6612..34e983bb01 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -68,7 +68,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which ablation + inputs (tensor or tuple of tensors): Input for which ablation attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. 
If forward_func takes multiple tensors as input, a tuple @@ -105,7 +105,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index 8d77aef01d..abad41aff2 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -108,7 +108,7 @@ def attribute( # type: ignore Args: - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which permutation attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If @@ -118,7 +118,7 @@ def attribute( # type: ignore 0 corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 484e2c8b64..84e32ab740 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -127,7 +127,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which SHAP attribution + inputs (tensor or tuple of tensors): Input for which SHAP attribution values are computed. 
If `forward_func` takes a single tensor as input, a single input tensor should be provided. If `forward_func` takes multiple tensors as input, a tuple @@ -158,7 +158,7 @@ def attribute( It is recommended that the number of samples in the baselines' tensors is larger than one. - n_samples (int, optional): The number of randomly generated examples + n_samples (int, optional): The number of randomly generated examples per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. @@ -171,7 +171,7 @@ def attribute( corresponds to the input with the same index in the inputs tuple. Default: 0.0 - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/guided_backprop_deconvnet.py b/captum/attr/_core/guided_backprop_deconvnet.py index e1953ed5b9..89b65ef1e2 100644 --- a/captum/attr/_core/guided_backprop_deconvnet.py +++ b/captum/attr/_core/guided_backprop_deconvnet.py @@ -27,7 +27,7 @@ def __init__(self, model: Module, use_relu_grad_output: bool = False) -> None: r""" Args: - model (nn.Module): The reference to PyTorch model instance. + model (nn.Module): The reference to PyTorch model instance. """ GradientAttribution.__init__(self, model) self.model = model @@ -121,7 +121,7 @@ def __init__(self, model: Module) -> None: r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place ReLU submodules; these are not supported by the register_full_backward_hook PyTorch API. 
""" @@ -139,7 +139,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -148,7 +148,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -234,7 +234,7 @@ def __init__(self, model: Module) -> None: r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place ReLU submodules; these are not supported by the register_full_backward_hook PyTorch API. """ @@ -250,7 +250,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -259,7 +259,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py index 1949f68666..91203983b5 100644 --- a/captum/attr/_core/guided_grad_cam.py +++ b/captum/attr/_core/guided_grad_cam.py @@ -51,7 +51,7 @@ def __init__( r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place ReLU submodules; these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. @@ -80,7 +80,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which attributions + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -88,7 +88,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/input_x_gradient.py b/captum/attr/_core/input_x_gradient.py index 7817466013..facfefa73e 100644 --- a/captum/attr/_core/input_x_gradient.py +++ b/captum/attr/_core/input_x_gradient.py @@ -22,7 +22,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it """ GradientAttribution.__init__(self, forward_func) @@ -37,7 +37,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -46,7 +46,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index a23b9346ab..a7933e95b5 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -53,7 +53,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it multiply_by_inputs (bool, optional): Indicates whether to factor model inputs' multiplier in the final attribution scores. 
@@ -130,7 +130,7 @@ def attribute( # type: ignore Args: - inputs (tensor or tuple of tensors): Input for which integrated + inputs (tensor or tuple of tensors): Input for which integrated gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -167,7 +167,7 @@ def attribute( # type: ignore use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/kernel_shap.py b/captum/attr/_core/kernel_shap.py index 1705a77770..85ff2f7f56 100644 --- a/captum/attr/_core/kernel_shap.py +++ b/captum/attr/_core/kernel_shap.py @@ -86,7 +86,7 @@ def attribute( # type: ignore Args: - inputs (tensor or tuple of tensors): Input for which KernelShap + inputs (tensor or tuple of tensors): Input for which KernelShap is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -124,7 +124,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). @@ -185,7 +185,7 @@ def attribute( # type: ignore If None, then a feature mask is constructed which assigns each scalar within a tensor as a separate feature. 
Default: None - n_samples (int, optional): The number of samples of the original + n_samples (int, optional): The number of samples of the original model used to train the surrogate interpretable model. Default: `50` if `n_samples` is not provided. perturbations_per_eval (int, optional): Allows multiple samples diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py index 0820018a2b..e3e7d69a46 100644 --- a/captum/attr/_core/layer/grad_cam.py +++ b/captum/attr/_core/layer/grad_cam.py @@ -59,7 +59,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's output @@ -86,7 +86,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which attributions + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -94,7 +94,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index 3ecf1cbac1..ad39144351 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -46,7 +46,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -78,7 +78,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which internal + inputs (tensor or tuple of tensors): Input for which internal influence is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -115,7 +115,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py index 7dc7f64dfe..8d2ff2c7f4 100644 --- a/captum/attr/_core/layer/layer_activation.py +++ b/captum/attr/_core/layer/layer_activation.py @@ -25,7 +25,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module or list of torch.nn.Module): Layer or layers for which attributions are computed. 
@@ -54,7 +54,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (tensor or tuple of tensors): Input for which layer activation is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index 6662de5858..cb2a5d40ff 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -49,7 +49,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -120,7 +120,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (tensor or tuple of tensors): Input for which layer conductance is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -157,7 +157,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 22dea7861f..38860476ac 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -69,7 +69,7 @@ def __init__( r""" Args: - model (nn.Module): The reference to PyTorch model instance. 
Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place nonlinear submodules; these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. @@ -144,7 +144,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (tensor or tuple of tensors): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, @@ -185,7 +185,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -403,7 +403,7 @@ def __init__( r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place nonlinear submodules; these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. @@ -482,7 +482,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (tensor or tuple of tensors): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -516,7 +516,7 @@ def attribute( It is recommended that the number of samples in the baselines' tensors is larger than one. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py index 42aad97598..3d7a071379 100644 --- a/captum/attr/_core/layer/layer_feature_ablation.py +++ b/captum/attr/_core/layer/layer_feature_ablation.py @@ -42,7 +42,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -75,7 +75,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (tensor or tuple of tensors): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -94,7 +94,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero as the baseline for each neuron. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index a7e5033779..5f622e5b15 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -67,7 +67,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. 
Output size of attribute matches this layer's input or @@ -146,7 +146,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input which are used to compute + inputs (tensor or tuple of tensors): Input which are used to compute SHAP attribution values for a given `layer`. If `forward_func` takes a single tensor as input, a single input tensor should be provided. @@ -178,7 +178,7 @@ def attribute( It is recommended that the number of samples in the baselines' tensors is larger than one. - n_samples (int, optional): The number of randomly generated examples + n_samples (int, optional): The number of randomly generated examples per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. @@ -191,7 +191,7 @@ def attribute( corresponds to the input with the same index in the inputs tuple. Default: 0.0 - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -335,7 +335,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. 
Output size of attribute matches this layer's input or diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index cba9f9558f..66f432fd64 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -30,7 +30,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module or list of torch.nn.Module): Layer or layers for which attributions are computed. @@ -80,7 +80,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which attributions + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -88,7 +88,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index b7244eae1f..04d1572504 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -53,7 +53,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (ModuleOrModuleList): Layer or list of layers for which attributions are computed. For each layer the output size of the attribute @@ -191,7 +191,7 @@ def attribute( Args: - inputs (tensor or tuple of tensors): Input for which layer integrated + inputs (tensor or tuple of tensors): Input for which layer integrated gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -226,7 +226,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index 2ed7218083..474f872c71 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -112,7 +112,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which relevance is + inputs (tensor or tuple of tensors): Input for which relevance is propagated. If forward_func takes a single tensor as input, a single input tensor should be provided. 
@@ -121,7 +121,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index 68f4ae386d..d868479996 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -82,7 +82,7 @@ def __init__( Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it. If a batch is provided as input for attribution, it is expected that forward_func returns a scalar representing the entire batch. @@ -266,7 +266,7 @@ def attribute( Args: - inputs (tensor or tuple of tensors): Input for which LIME + inputs (tensor or tuple of tensors): Input for which LIME is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -274,7 +274,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). @@ -315,7 +315,7 @@ def attribute( Note that attributions are not computed with respect to these arguments. 
Default: None - n_samples (int, optional): The number of samples of the original + n_samples (int, optional): The number of samples of the original model used to train the surrogate interpretable model. Default: `50` if `n_samples` is not provided. perturbations_per_eval (int, optional): Allows multiple samples @@ -603,7 +603,7 @@ def get_exp_kernel_similarity_function( Args: - distance_mode (str, optional): Distance mode can be either "cosine" or + distance_mode (str, optional): Distance mode can be either "cosine" or "euclidean" corresponding to either cosine distance or Euclidean distance respectively. Distance is computed by flattening the original inputs and perturbed inputs @@ -732,7 +732,7 @@ def __init__( Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it interpretable_model (optional, Model): Model object to train interpretable model. @@ -879,7 +879,7 @@ def attribute( # type: ignore Args: - inputs (tensor or tuple of tensors): Input for which LIME + inputs (tensor or tuple of tensors): Input for which LIME is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -917,7 +917,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). @@ -978,7 +978,7 @@ def attribute( # type: ignore If None, then a feature mask is constructed which assigns each scalar within a tensor as a separate feature. 
Default: None - n_samples (int, optional): The number of samples of the original + n_samples (int, optional): The number of samples of the original model used to train the surrogate interpretable model. Default: `50` if `n_samples` is not provided. perturbations_per_eval (int, optional): Allows multiple samples diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index 7c51046df2..bcffd0304a 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -99,7 +99,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which relevance is + inputs (tensor or tuple of tensors): Input for which relevance is propagated. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -108,7 +108,7 @@ def attribute( to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index 9896b36d9d..97855d6c10 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -45,7 +45,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which neuron attributions are computed. 
Attributions for a particular neuron in the input or output @@ -103,7 +103,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which neuron + inputs (tensor or tuple of tensors): Input for which neuron conductance is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -172,7 +172,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 6096dd72ca..406ea087d9 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -46,7 +46,7 @@ def __init__( r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place nonlinear submodules; these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. @@ -90,7 +90,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (tensor or tuple of tensors): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, @@ -290,7 +290,7 @@ def __init__( r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. 
Model cannot contain any in-place nonlinear submodules; these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. @@ -335,7 +335,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (tensor or tuple of tensors): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index f5e79a5c8a..8bf3c2e6b8 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -35,7 +35,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Attributions for a particular neuron in the input or output @@ -67,7 +67,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which neuron + inputs (tensor or tuple of tensors): Input for which neuron attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 6ce2b2acec..762346a41b 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -33,7 +33,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. 
Output size of attribute matches this layer's input or @@ -64,7 +64,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which neuron + inputs (tensor or tuple of tensors): Input for which neuron gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index 6eeb90bab9..632758b748 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -56,7 +56,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which neuron attributions are computed. The output size of the attribute method matches the @@ -106,7 +106,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which SHAP attribution + inputs (tensor or tuple of tensors): Input for which SHAP attribution values are computed. If `forward_func` takes a single tensor as input, a single input tensor should be provided. If `forward_func` takes multiple tensors as input, a tuple @@ -170,7 +170,7 @@ def attribute( It is recommended that the number of samples in the baselines' tensors is larger than one. - n_samples (int, optional): The number of randomly generated examples + n_samples (int, optional): The number of randomly generated examples per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. 
diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 7b15a5fdea..268055d9dc 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -35,7 +35,7 @@ def __init__( r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place ReLU submodules; these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. @@ -69,7 +69,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -207,7 +207,7 @@ def __init__( r""" Args: - model (nn.Module): The reference to PyTorch model instance. Model cannot + model (nn.Module): The reference to PyTorch model instance. Model cannot contain any in-place ReLU submodules; these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. @@ -238,7 +238,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. 
If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index 664b9d935c..e415afa111 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -33,7 +33,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -84,7 +84,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which neuron integrated + inputs (tensor or tuple of tensors): Input for which neuron integrated gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 4830b3ad12..998e5eccd2 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -94,7 +94,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which integrated + inputs (tensor or tuple of tensors): Input for which integrated gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -105,11 +105,11 @@ def attribute( nt_type (str, optional): Smoothing type of the attributions. `smoothgrad`, `smoothgrad_sq` or `vargrad` Default: `smoothgrad` if `type` is not provided. - nt_samples (int, optional): The number of randomly generated examples + nt_samples (int, optional): The number of randomly generated examples per sample in the input batch. 
Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `nt_samples` is not provided. - nt_samples_batch_size (int, optional): The number of the `nt_samples` + nt_samples_batch_size (int, optional): The number of the `nt_samples` that will be processed together. With the help of this parameter we can avoid out of memory situation and reduce the number of randomly generated examples per sample diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index f54d4079af..060bea827a 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -62,7 +62,7 @@ def attribute( # type: ignore r""" Args: - inputs (tensor or tuple of tensors): Input for which occlusion + inputs (tensor or tuple of tensors): Input for which occlusion attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -128,7 +128,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py index 3790bd2068..0cada6606a 100644 --- a/captum/attr/_core/saliency.py +++ b/captum/attr/_core/saliency.py @@ -43,7 +43,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which saliency + inputs (tensor or tuple of tensors): Input for which saliency is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. 
If forward_func takes multiple tensors as input, a tuple @@ -52,7 +52,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index 622d0469f0..9090480145 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -96,7 +96,7 @@ def attribute( Args: - inputs (tensor or tuple of tensors): Input for which Shapley value + inputs (tensor or tuple of tensors): Input for which Shapley value sampling attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. @@ -135,7 +135,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -197,7 +197,7 @@ def attribute( If None, then a feature mask is constructed which assigns each scalar within a tensor as a separate feature Default: None - n_samples (int, optional): The number of feature permutations + n_samples (int, optional): The number of feature permutations tested. Default: `25` if `n_samples` is not provided. 
perturbations_per_eval (int, optional): Allows multiple ablations @@ -549,7 +549,7 @@ def attribute( Args: - inputs (tensor or tuple of tensors): Input for which Shapley value + inputs (tensor or tuple of tensors): Input for which Shapley value sampling attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. @@ -588,7 +588,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index 0eddbdf880..1cfbf7bb72 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -47,7 +47,7 @@ def __init__(self, forward_func: Callable) -> None: Args: - inputs (tensor or tuple of tensors): Input for which attribution + inputs (tensor or tuple of tensors): Input for which attribution is computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors are provided, the batch sizes must be aligned accross all @@ -198,12 +198,12 @@ def compute_convergence_delta( is the starting point of attributions' approximation. It is assumed that both `start_point` and `end_point` have the same shape and dimensionality. - end_point (tensor or tuple of tensors): `end_point` + end_point (tensor or tuple of tensors): `end_point` is passed as an input to model's forward function. It is the end point of attributions' approximation. It is assumed that both `start_point` and `end_point` have the same shape and dimensionality. 
- target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -333,7 +333,7 @@ def __init__( r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. layer (torch.nn.Module): Layer for which output attributions are computed. @@ -366,7 +366,7 @@ def __init__( r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. layer (torch.nn.Module): Layer for which output attributions are computed. @@ -392,13 +392,13 @@ def interpolate( Args: - layer_attribution (torch.Tensor): Tensor of given layer attributions. + layer_attribution (torch.Tensor): Tensor of given layer attributions. interpolate_dims (int or tuple): Upsampled dimensions. The number of elements must be the number of dimensions of layer_attribution - 2, since the first dimension corresponds to number of examples and the second is assumed to correspond to the number of channels. - interpolate_mode (str): Method for interpolation, which + interpolate_mode (str): Method for interpolation, which must be a valid input interpolation mode for torch.nn.functional. These methods are "nearest", "area", "linear" (3D-only), "bilinear" @@ -436,7 +436,7 @@ def __init__( r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. 
layer (torch.nn.Module): Layer for which output attributions are computed. diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index f8de5fa45a..b9e5f2ec9a 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -122,7 +122,7 @@ def visualize_image_attr( visualized. Shape must be in the form (H, W, C), with channels as last dimension. Shape must also match that of the original image if provided. - original_image (numpy.array, optional): Numpy array corresponding to + original_image (numpy.array, optional): Numpy array corresponding to original image. Shape must be in the form (H, W, C), with channels as the last dimension. Image can be provided either with float values in range 0-1 or int values between 0-255. @@ -353,7 +353,7 @@ def visualize_image_attr_multiple( visualized. Shape must be in the form (H, W, C), with channels as last dimension. Shape must also match that of the original image if provided. - original_image (numpy.array, optional): Numpy array corresponding to + original_image (numpy.array, optional): Numpy array corresponding to original image. Shape must be in the form (H, W, C), with channels as the last dimension. Image can be provided either with values in range 0-1 or 0-255. This is a necessary @@ -365,7 +365,7 @@ def visualize_image_attr_multiple( signs (list of str): List of strings of length k, defining signs for each visualization. Each sign must be a valid string argument for sign to visualize_image_attr. - titles (list of str, optional): List of strings of length k, providing + titles (list of str, optional): List of strings of length k, providing a title string for each plot. If None is provided, no titles are added to subplots. 
Default: None diff --git a/captum/concept/_core/cav.py b/captum/concept/_core/cav.py index 39aa9fba85..a1c0231bc6 100644 --- a/captum/concept/_core/cav.py +++ b/captum/concept/_core/cav.py @@ -146,7 +146,7 @@ def load(cavs_path: str, model_id: str, concepts: List[Concept], layer: str): model_id (str): A unique model identifier associated with the CAVs. There exist a folder named `model_id` under `cavs_path` path. The CAVs are loaded from this folder. - concepts (list[Concept]): A List of concepts for which + concepts (list[Concept]): A List of concepts for which we would like to load the cavs. layer (str): The layer name. Ex.: "inception4c". In case of nested layers we use dots to specify the depth / hierarchy. diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index dfe7e1374d..f352b6b67b 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -590,7 +590,7 @@ def interpret( provided, the examples must be aligned appropriately. experimental_sets (list[list[Concept]]): A list of list of Concept instances. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor or list, optional): Output indices for which attributions are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/influence/_core/similarity_influence.py b/captum/influence/_core/similarity_influence.py index 03251de37d..f3acce6a0d 100644 --- a/captum/influence/_core/similarity_influence.py +++ b/captum/influence/_core/similarity_influence.py @@ -172,7 +172,7 @@ def influence( # type: ignore[override] to the batch size. A tuple of tensors is only passed in if this is the input form that `module` accepts. 
top_k (int): The number of top-matching activations to return - additional_forward_args (optional): Additional arguments that will be + additional_forward_args (optional): Additional arguments that will be passed to forward_func after inputs. load_src_from_disk (bool): Loads activations for `influence_src_dataset` where possible. Setting to False would force regeneration of diff --git a/captum/metrics/_core/infidelity.py b/captum/metrics/_core/infidelity.py index e90e88ce09..ea1981101c 100644 --- a/captum/metrics/_core/infidelity.py +++ b/captum/metrics/_core/infidelity.py @@ -211,7 +211,7 @@ def infidelity( input examples that are repeated `max_examples_per_batch / batch_size` times within the batch. - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple diff --git a/captum/metrics/_core/sensitivity.py b/captum/metrics/_core/sensitivity.py index b03029cf10..7067149fdc 100644 --- a/captum/metrics/_core/sensitivity.py +++ b/captum/metrics/_core/sensitivity.py @@ -108,7 +108,7 @@ def sensitivity_max( attribution algorithm or any other explanation method that returns the explanations. - inputs (tensor or tuple of tensors): Input for which + inputs (tensor or tuple of tensors): Input for which explanations are computed. If `explanation_func` takes a single tensor as input, a single input tensor should be provided. 
From fe159294de947334b4191ace16d01e50ea33c5a5 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 20 Jul 2022 19:44:33 -0600 Subject: [PATCH 18/84] Fix more Sphinx errors --- captum/insights/__init__.py | 2 +- captum/insights/attr_vis/app.py | 4 ++-- sphinx/source/insights.rst | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/captum/insights/__init__.py b/captum/insights/__init__.py index 48ba6fdfa0..2ba766cdd2 100644 --- a/captum/insights/__init__.py +++ b/captum/insights/__init__.py @@ -1 +1 @@ -from captum.insights.attr_vis import AttributionVisualizer, Batch # noqa +from captum.insights.attr_vis import AttributionVisualizer, Batch, features # noqa diff --git a/captum/insights/attr_vis/app.py b/captum/insights/attr_vis/app.py index 9a0433090b..04c30da245 100644 --- a/captum/insights/attr_vis/app.py +++ b/captum/insights/attr_vis/app.py @@ -151,7 +151,7 @@ def __init__( models (torch.nn.module): One or more PyTorch modules (models) for attribution visualization. - classes (list of string): List of strings corresponding to the names of + classes (list of str): List of strings corresponding to the names of classes for classification. features (list of BaseFeature): List of BaseFeatures, which correspond to input arguments to the model. Each feature object defines @@ -175,7 +175,7 @@ def __init__( are taken directly and assumed to correspond to the class scores. Default: None - use_label_for_attr (boolean, optional): If true, the class index is passed + use_label_for_attr (bool, optional): If true, the class index is passed to the relevant attribution method. This is necessary in most cases where there is an output neuron corresponding to each class. When the model output is a scalar and class index diff --git a/sphinx/source/insights.rst b/sphinx/source/insights.rst index ece9180971..1e0963d483 100644 --- a/sphinx/source/insights.rst +++ b/sphinx/source/insights.rst @@ -4,12 +4,12 @@ Insights Batch ^^^^^ -.. 
autoclass:: captum.insights.api.Batch +.. autoclass:: captum.insights.Batch :members: AttributionVisualizer ^^^^^^^^^^^^^^^^^^^^^ -.. autoclass:: captum.insights.api.AttributionVisualizer +.. autoclass:: captum.insights.AttributionVisualizer :members: From 0546b6c856ac9a0c98845616e28ae821fe941bc5 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 20 Jul 2022 20:12:37 -0600 Subject: [PATCH 19/84] Tensor -> tensor --- captum/concept/_core/tcav.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index f352b6b67b..3cb3de4b8b 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -82,9 +82,9 @@ def __getitem__(self, i): i (int): which (activation vector, label) batch in the dataset to return Returns: - inputs (Tensor): i-th batch in Dataset (representing activation + inputs (tensor): i-th batch in Dataset (representing activation vectors) - labels (Tensor): labels of i-th batch in Dataset + labels (tensor): labels of i-th batch in Dataset """ assert i < self.length k = self._i_to_k(i) From 4c9d6e786a34111643605ba80c2d9c1975f0684d Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 09:13:12 -0600 Subject: [PATCH 20/84] Fix docstring type formatting --- captum/concept/_core/cav.py | 6 ++--- captum/concept/_core/tcav.py | 22 +++++++++---------- captum/concept/_utils/common.py | 2 +- captum/insights/attr_vis/features.py | 2 +- .../robust/_core/metrics/attack_comparator.py | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/captum/concept/_core/cav.py b/captum/concept/_core/cav.py index a1c0231bc6..ab61c3f3d5 100644 --- a/captum/concept/_core/cav.py +++ b/captum/concept/_core/cav.py @@ -30,7 +30,7 @@ def __init__( and loads them from the disk (storage). Args: - concepts (list[Concept]): a List of Concept objects. Only their + concepts (list of Concept): a List of Concept objects. Only their names will be saved and loaded. 
layer (str): The layer where concept activation vectors are computed using a predefined classifier. @@ -65,7 +65,7 @@ def assemble_save_path( layer name. model_id (str): A unique model identifier associated with input `layer` and `concepts` - concepts (list(Concept)): A list of concepts that are concatenated + concepts (list of Concept): A list of concepts that are concatenated together and used as a concept key using their ids. These concept ids are retrieved from TCAV s`Concept` objects. layer (str): The name of the layer for which the activations are @@ -146,7 +146,7 @@ def load(cavs_path: str, model_id: str, concepts: List[Concept], layer: str): model_id (str): A unique model identifier associated with the CAVs. There exist a folder named `model_id` under `cavs_path` path. The CAVs are loaded from this folder. - concepts (list[Concept]): A List of concepts for which + concepts (list of Concept): A List of concepts for which we would like to load the cavs. layer (str): The layer name. Ex.: "inception4c". In case of nested layers we use dots to specify the depth / hierarchy. diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index 3cb3de4b8b..b8c546d72c 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -40,10 +40,10 @@ def __init__(self, datasets: List[AV.AVDataset], labels: List[int]) -> None: Args: - datasets (list[Dataset]): The k-th element of datasets is a Dataset + datasets (list of Dataset): The k-th element of datasets is a Dataset representing activation vectors associated with the k-th concept - labels (list[int]): The k-th element of labels is the integer label + labels (list of int): The k-th element of labels is the integer label associated with the k-th concept """ assert len(datasets) == len( @@ -120,11 +120,11 @@ def train_cav( model_id (str): A unique identifier for the PyTorch model for which we would like to load the layer activations and train a model in order to compute CAVs. 
- concepts (list[Concept]): A list of Concept objects that are used + concepts (list of Concept): A list of Concept objects that are used to train a classifier and learn decision boundaries between those concepts for each layer defined in the `layers` argument. - layers (str, list[str]): A list of layer names or a single layer + layers (str, list of str): A list of layer names or a single layer name that is used to compute the activations of all concept examples per concept and train a classifier using those activations. @@ -258,7 +258,7 @@ def __init__( model (Module): An instance of pytorch model that is used to compute layer activations and attributions. - layers (str, list[str]): A list of layer name(s) that are + layers (str, list of str): A list of layer name(s) that are used for computing concept activations (cavs) and layer attributions. model_id (str, optional): A unique identifier for the PyTorch `model` @@ -347,7 +347,7 @@ def generate_activation(self, layers: Union[str, List], concept: Concept) -> Non the list of layer(s) `layers`. Args: - layers (str, list[str]): A list of layer names or a layer name + layers (str, list of str): A list of layer names or a layer name that is used to compute layer activations for the specific `concept`. concept (Concept): A single Concept object that provides access @@ -384,7 +384,7 @@ def generate_activations(self, concept_layers: Dict[Concept, List[str]]) -> None `concept_layers` dictionary. Args: - concept_layers (dict[Concept, list[str]]): Dictionay that maps + concept_layers (dict[Concept, list of str]): Dictionay that maps Concept objects to a list of layer names to generate the activations. Ex.: concept_layers = {"striped": ['inception4c', 'inception4d']} @@ -409,11 +409,11 @@ def load_cavs( Args: - concepts (list[Concept]): A list of Concept objects for which we want + concepts (list of Concept): A list of Concept objects for which we want to load the CAV. 
Returns: - layers (list[layer]): A list of layers for which some CAVs still need + layers (list of layer): A list of layers for which some CAVs still need to be computed. concept_layers (dict[concept, layer]): A dictionay of concept-layers mapping for which we need to perform CAV computation through @@ -465,7 +465,7 @@ def compute_cavs( Args: - experimental_sets (list[list[Concept]]): A list of lists of concept + experimental_sets (list of list of Concept): A list of lists of concept instances for which the cavs will be computed. force_train (bool, optional): A flag that indicates whether to train the CAVs regardless of whether they are saved or not. @@ -588,7 +588,7 @@ def interpret( dimension 0 corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - experimental_sets (list[list[Concept]]): A list of list of Concept + experimental_sets (list of list of Concept): A list of list of Concept instances. target (int, tuple, tensor or list, optional): Output indices for which attributions are computed (for classification cases, diff --git a/captum/concept/_utils/common.py b/captum/concept/_utils/common.py index 6161736509..dece946601 100644 --- a/captum/concept/_utils/common.py +++ b/captum/concept/_utils/common.py @@ -11,7 +11,7 @@ def concepts_to_str(concepts: List[Concept]) -> str: Example output: "striped-random_0-random_1" Args: - concepts (list[Concept]): a List of concept names to be + concepts (list of Concept): a List of concept names to be concatenated and used as a concepts key. These concept names are respective to the Concept objects used for the classifier train. 
diff --git a/captum/insights/attr_vis/features.py b/captum/insights/attr_vis/features.py index 0986170758..5b1d431f61 100644 --- a/captum/insights/attr_vis/features.py +++ b/captum/insights/attr_vis/features.py @@ -239,7 +239,7 @@ def __init__(self, name: str, categories: List[str]) -> None: Args: name (str): The label of the specific feature. For example, an ImageFeature's name can be "Photo". - categories (list[str]): Category labels for the general feature. The + categories (list of str): Category labels for the general feature. The order and size should match the second dimension of the ``data`` tensor parameter in ``visualize``. """ diff --git a/captum/robust/_core/metrics/attack_comparator.py b/captum/robust/_core/metrics/attack_comparator.py index 57b03e8f18..890cc01720 100644 --- a/captum/robust/_core/metrics/attack_comparator.py +++ b/captum/robust/_core/metrics/attack_comparator.py @@ -133,7 +133,7 @@ def add_attack( attack_kwargs (dict): Additional arguments to be provided to given attack. This should be provided as a dictionary of keyword arguments. - additional_attack_arg_names (list[str]): Any additional arguments for the + additional_attack_arg_names (list of str): Any additional arguments for the attack which are specific to the particular input example or batch. An example of this is target, which is necessary for some attacks such as FGSM or PGD. 
These arguments are included if provided as a kwarg From a4f16b31c749bba453887bfc9d3fd9c6bb8c909a Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 09:14:40 -0600 Subject: [PATCH 21/84] remove 's' from int & float types --- captum/attr/_core/gradient_shap.py | 2 +- captum/attr/_core/layer/layer_gradient_shap.py | 2 +- captum/attr/_core/neuron/neuron_conductance.py | 2 +- captum/attr/_core/neuron/neuron_deep_lift.py | 4 ++-- captum/attr/_core/neuron/neuron_feature_ablation.py | 2 +- captum/attr/_core/neuron/neuron_gradient.py | 2 +- captum/attr/_core/neuron/neuron_gradient_shap.py | 4 ++-- captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py | 4 ++-- captum/attr/_core/neuron/neuron_integrated_gradients.py | 2 +- captum/attr/_core/noise_tunnel.py | 2 +- captum/attr/_core/occlusion.py | 2 +- captum/robust/_core/metrics/attack_comparator.py | 2 +- 12 files changed, 15 insertions(+), 15 deletions(-) diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 84e32ab740..55a7c0c194 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -162,7 +162,7 @@ def attribute( per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or tuple of floats, optional): The standard deviation + stdevs (float, or tuple of float, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 5f622e5b15..a5835d0568 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -182,7 +182,7 @@ def attribute( per sample in the input batch. 
Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or a tuple of floats optional): The standard deviation + stdevs (float, or a tuple of float optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index 97855d6c10..c708a393b8 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -111,7 +111,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 406ea087d9..a1ef6466fe 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -99,7 +99,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -344,7 +344,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. 
- neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index 8bf3c2e6b8..63868a5092 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -75,7 +75,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 762346a41b..c319b5c6b2 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -72,7 +72,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index 632758b748..484ed42711 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -114,7 +114,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -174,7 +174,7 @@ def attribute( per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or a tuple of floats optional): The standard deviation + stdevs (float, or a tuple of float optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 268055d9dc..aeb3281b80 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -78,7 +78,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: @@ -247,7 +247,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index e415afa111..dcac7ebc42 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -92,7 +92,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of ints or slices): + neuron_selector (int, callable, or tuple of int or slices): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 998e5eccd2..eb72a7101c 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -116,7 +116,7 @@ def attribute( in each batch. Default: None if `nt_samples_batch_size` is not provided. In this case all `nt_samples` will be processed together. - stdevs (float, or a tuple of floats optional): The standard deviation + stdevs (float, or a tuple of float optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. 
If it is diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index 060bea827a..0f953e98b7 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -80,7 +80,7 @@ def attribute( # type: ignore this must be a tuple containing one tuple for each input tensor defining the dimensions of the patch for that input tensor, as described for the single tensor case. - strides (int or tuple or tuple of ints or tuple of tuple, optional): + strides (int or tuple or tuple of int or tuple of tuple, optional): This defines the step by which the occlusion hyperrectangle should be shifted by in each direction for each iteration. For a single tensor input, this can be either a single diff --git a/captum/robust/_core/metrics/attack_comparator.py b/captum/robust/_core/metrics/attack_comparator.py index 890cc01720..314a91b6aa 100644 --- a/captum/robust/_core/metrics/attack_comparator.py +++ b/captum/robust/_core/metrics/attack_comparator.py @@ -78,7 +78,7 @@ def __init__( All kwargs provided to evaluate are provided to the metric function, following the model output. A single metric can be returned as a float or tensor, and multiple metrics should be returned as either - a tuple or named tuple of floats or tensors. For a tensor metric, + a tuple or named tuple of float or tensors. For a tensor metric, the first dimension should match the batch size, corresponding to metrics for each example. Tensor metrics are averaged over the first dimension when aggregating multiple batch results. 
From 81b157e55e7a2f442868b61f1caf8bf0eb2208c5 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 09:17:02 -0600 Subject: [PATCH 22/84] slices -> slice --- captum/attr/_core/neuron/neuron_conductance.py | 2 +- captum/attr/_core/neuron/neuron_deep_lift.py | 4 ++-- captum/attr/_core/neuron/neuron_feature_ablation.py | 2 +- captum/attr/_core/neuron/neuron_gradient.py | 2 +- captum/attr/_core/neuron/neuron_gradient_shap.py | 2 +- captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py | 4 ++-- captum/attr/_core/neuron/neuron_integrated_gradients.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index c708a393b8..0a3b43248e 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -111,7 +111,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index a1ef6466fe..22776088b2 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -99,7 +99,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: @@ -344,7 +344,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index 63868a5092..f53afafacd 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -75,7 +75,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index c319b5c6b2..93ad3ef52a 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -72,7 +72,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index 484ed42711..0a965b08a6 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -114,7 +114,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index aeb3281b80..ec2ca43146 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -78,7 +78,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -247,7 +247,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index dcac7ebc42..de2829aa89 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -92,7 +92,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slices): + neuron_selector (int, callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: From abb3ee791010a2f86ee2edeb18c935fb34254def Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 09:24:10 -0600 Subject: [PATCH 23/84] numpy.array -> numpy.ndarray numpy.array is an array creation function. ndarray is the type. --- captum/attr/_utils/visualization.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index b9e5f2ec9a..a4536fc759 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -118,11 +118,11 @@ def visualize_image_attr( Args: - attr (numpy.array): Numpy array corresponding to attributions to be + attr (numpy.ndarray): Numpy array corresponding to attributions to be visualized. Shape must be in the form (H, W, C), with channels as last dimension. Shape must also match that of the original image if provided. - original_image (numpy.array, optional): Numpy array corresponding to + original_image (numpy.ndarray, optional): Numpy array corresponding to original image. Shape must be in the form (H, W, C), with channels as the last dimension. Image can be provided either with float values in range 0-1 or int values between 0-255. 
@@ -349,11 +349,11 @@ def visualize_image_attr_multiple( Args: - attr (numpy.array): Numpy array corresponding to attributions to be + attr (numpy.ndarray): Numpy array corresponding to attributions to be visualized. Shape must be in the form (H, W, C), with channels as last dimension. Shape must also match that of the original image if provided. - original_image (numpy.array, optional): Numpy array corresponding to + original_image (numpy.ndarray, optional): Numpy array corresponding to original image. Shape must be in the form (H, W, C), with channels as the last dimension. Image can be provided either with values in range 0-1 or 0-255. This is a necessary @@ -462,15 +462,15 @@ def visualize_timeseries_attr( Args: - attr (numpy.array): Numpy array corresponding to attributions to be + attr (numpy.ndarray): Numpy array corresponding to attributions to be visualized. Shape must be in the form (N, C) with channels as last dimension, unless `channels_last` is set to True. Shape must also match that of the timeseries data. - data (numpy.array): Numpy array corresponding to the original, + data (numpy.ndarray): Numpy array corresponding to the original, equidistant timeseries data. Shape must be in the form (N, C) with channels as last dimension, unless `channels_last` is set to true. - x_values (numpy.array, optional): Numpy array corresponding to the + x_values (numpy.ndarray, optional): Numpy array corresponding to the points on the x-axis. Shape must be in the form (N, ). If not provided, integers from 0 to N-1 are used. 
Default: None From ba3a0b4b63ce82edc00900177cd2aa2a59c7376e Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 09:37:08 -0600 Subject: [PATCH 24/84] Fix more doc types --- captum/_utils/av.py | 2 +- captum/_utils/gradient.py | 4 ++-- captum/influence/_utils/common.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/captum/_utils/av.py b/captum/_utils/av.py index 5250af7ea4..ccc9a1763e 100644 --- a/captum/_utils/av.py +++ b/captum/_utils/av.py @@ -213,7 +213,7 @@ def save( `layers` is provided. layers (str or list of str): The layer(s) for which the activation vectors are computed. - act_tensors (Tensor or List of Tensor): A batch of activation vectors. + act_tensors (tensor or list of tensor): A batch of activation vectors. This must match the dimension of `layers`. num_id (str): string representing the batch number for which the activation vectors are computed diff --git a/captum/_utils/gradient.py b/captum/_utils/gradient.py index a15157d8d7..2b754f05e8 100644 --- a/captum/_utils/gradient.py +++ b/captum/_utils/gradient.py @@ -730,7 +730,7 @@ def _compute_jacobian_wrt_params( but must behave as a library loss function would if `reduction='none'`. Returns: - grads (Tuple of Tensor): Returns the Jacobian for the minibatch as a + grads (tuple of tensor): Returns the Jacobian for the minibatch as a tuple of gradients corresponding to the tuple of trainable parameters returned by `model.parameters()`. Each object grads[i] references to the gradients for the parameters in the i-th trainable layer of the model. @@ -804,7 +804,7 @@ def _compute_jacobian_wrt_params_with_sample_wise_trick( Defaults to 'sum'. Returns: - grads (Tuple of Tensor): Returns the Jacobian for the minibatch as a + grads (tuple of tensor): Returns the Jacobian for the minibatch as a tuple of gradients corresponding to the tuple of trainable parameters returned by `model.parameters()`. 
Each object grads[i] references to the gradients for the parameters in the i-th trainable layer of the model. diff --git a/captum/influence/_utils/common.py b/captum/influence/_utils/common.py index 2b28670c82..3da00c9e48 100644 --- a/captum/influence/_utils/common.py +++ b/captum/influence/_utils/common.py @@ -209,9 +209,9 @@ def _get_k_most_influential_helper( influence_batch_fn (callable): A callable that will be called via `influence_batch_fn(inputs, targets, batch)`, where `batch` is a batch in the `influence_src_dataloader` argument. - inputs (Tuple of Any): A batch of examples. Does not represent labels, + inputs (tuple of Any): A batch of examples. Does not represent labels, which are passed as `targets`. - targets (Tensor, optional): If computing TracIn scores on a loss function, + targets (tensor, optional): If computing TracIn scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None k (int, optional): The number of proponents or opponents to return per test From d3f043172e2b12de3cf710f512bbb3fea6b6a60e Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 10:54:28 -0600 Subject: [PATCH 25/84] Don't link directly to arXiv PDF files --- captum/attr/_core/deep_lift.py | 2 +- captum/attr/_core/guided_grad_cam.py | 2 +- captum/attr/_core/layer/grad_cam.py | 2 +- captum/attr/_core/layer/internal_influence.py | 2 +- captum/attr/_core/layer/layer_conductance.py | 2 +- captum/attr/_core/layer/layer_deep_lift.py | 2 +- captum/attr/_core/neuron/neuron_deep_lift.py | 2 +- captum/attr/_core/noise_tunnel.py | 2 +- captum/attr/_core/saliency.py | 2 +- captum/attr/_core/shapley_value.py | 6 +++--- captum/concept/_core/cav.py | 2 +- captum/concept/_core/tcav.py | 2 +- captum/influence/_core/tracincp.py | 2 +- captum/influence/_core/tracincp_fast_rand_proj.py | 8 ++++---- captum/metrics/_core/infidelity.py | 8 ++++---- captum/metrics/_core/sensitivity.py | 6 +++--- captum/robust/_core/fgsm.py | 2 +- 
captum/robust/_core/pgd.py | 2 +- 18 files changed, 28 insertions(+), 28 deletions(-) diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 15b1b65362..96a89fba66 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -611,7 +611,7 @@ class DeepLiftShap(DeepLift): each baseline and averages resulting attributions. More details about the algorithm can be found here: - http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf + http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions Note that the explanation model: diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py index 91203983b5..5f33e08041 100644 --- a/captum/attr/_core/guided_grad_cam.py +++ b/captum/attr/_core/guided_grad_cam.py @@ -38,7 +38,7 @@ class GuidedGradCam(GradientAttribution): More details regarding GuidedGradCAM can be found in the original GradCAM paper here: - https://arxiv.org/pdf/1610.02391.pdf + https://arxiv.org/abs/1610.02391 Warning: Ensure that all ReLU operations in the forward function of the given model are performed using a module (nn.module.ReLU). 
diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py index e3e7d69a46..b9f8496da9 100644 --- a/captum/attr/_core/layer/grad_cam.py +++ b/captum/attr/_core/layer/grad_cam.py @@ -47,7 +47,7 @@ class LayerGradCam(LayerAttribution, GradientAttribution): More details regarding the GradCAM method can be found in the original paper here: - https://arxiv.org/pdf/1610.02391.pdf + https://arxiv.org/abs/1610.02391 """ def __init__( diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index ad39144351..668385c3ff 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -30,7 +30,7 @@ class InternalInfluence(LayerAttribution, GradientAttribution): given input. If no baseline is provided, the default baseline is the zero tensor. More details on this approach can be found here: - https://arxiv.org/pdf/1802.03788.pdf + https://arxiv.org/abs/1802.03788 Note that this method is similar to applying integrated gradients and taking the layer as input, integrating the gradient of the layer with diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index cb2a5d40ff..988c2339b6 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -32,7 +32,7 @@ class LayerConductance(LayerAttribution, GradientAttribution): The details of the approach can be found here: https://arxiv.org/abs/1805.12233 - https://arxiv.org/pdf/1807.09946.pdf + https://arxiv.org/abs/1807.09946 Note that this provides the total conductance of each neuron in the layer's output. 
To obtain the breakdown of a neuron's conductance by input diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 38860476ac..08e27c41e5 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -382,7 +382,7 @@ class LayerDeepLiftShap(LayerDeepLift, DeepLiftShap): input flag `attribute_to_layer_input`. More details about the algorithm can be found here: - http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf + http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions Note that the explanation model: diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 22776088b2..48dd963c6a 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -273,7 +273,7 @@ class NeuronDeepLiftShap(NeuronAttribution, GradientAttribution): by the input flag `attribute_to_layer_input`. More details about the algorithm can be found here: - http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf + http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions Note that the explanation model: 1. 
Assumes that input features are independent of one another diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index eb72a7101c..73cdfdfe4e 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -46,7 +46,7 @@ class NoiseTunnel(Attribution): https://arxiv.org/abs/1810.03292 https://arxiv.org/abs/1810.03307 https://arxiv.org/abs/1706.03825 - https://arxiv.org/pdf/1806.10758 + https://arxiv.org/abs/1806.10758 This method currently also supports batches of multiple examples input, however it can be computationally expensive depending on the model, diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py index 0cada6606a..44f747b831 100644 --- a/captum/attr/_core/saliency.py +++ b/captum/attr/_core/saliency.py @@ -20,7 +20,7 @@ class Saliency(GradientAttribution): the default, the absolute value of the gradients is returned. More details about the approach can be found in the following paper: - https://arxiv.org/pdf/1312.6034.pdf + https://arxiv.org/abs/1312.6034 """ def __init__(self, forward_func: Callable) -> None: diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index 9090480145..95e1691cb5 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -59,7 +59,7 @@ class ShapleyValueSampling(PerturbationAttribution): More details regarding Shapley Value sampling can be found in these papers: https://www.sciencedirect.com/science/article/pii/S0305054808000804 - https://pdfs.semanticscholar.org/7715/bb1070691455d1fcfc6346ff458dbca77b2c.pdf + https://pdfs.semanticscholar.org/7715/bb1070691455d1fcfc6346ff458dbca77b2c """ def __init__(self, forward_func: Callable) -> None: @@ -504,9 +504,9 @@ class ShapleyValues(ShapleyValueSampling): the entire feature group. 
More details regarding Shapley Values can be found in these papers: - https://apps.dtic.mil/dtic/tr/fulltext/u2/604084.pdf + https://apps.dtic.mil/dtic/tr/fulltext/u2/604084 https://www.sciencedirect.com/science/article/pii/S0305054808000804 - https://pdfs.semanticscholar.org/7715/bb1070691455d1fcfc6346ff458dbca77b2c.pdf + https://pdfs.semanticscholar.org/7715/bb1070691455d1fcfc6346ff458dbca77b2c NOTE: The method implemented here is very computationally intensive, and should only be used with a very small number of features (e.g. < 7). diff --git a/captum/concept/_core/cav.py b/captum/concept/_core/cav.py index ab61c3f3d5..9ded9c4032 100644 --- a/captum/concept/_core/cav.py +++ b/captum/concept/_core/cav.py @@ -14,7 +14,7 @@ class CAV: boundary of a classifier which distinguishes between activation vectors produced by different concepts. More details can be found in the paper: - https://arxiv.org/pdf/1711.11279.pdf + https://arxiv.org/abs/1711.11279 """ def __init__( diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index b8c546d72c..66d2ef4b96 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -207,7 +207,7 @@ class TCAV(ConceptInterpreter): This class implements ConceptInterpreter abstract class using an approach called Testing with Concept Activation Vectors (TCAVs), as described in the paper: - https://arxiv.org/pdf/1711.11279.pdf + https://arxiv.org/abs/1711.11279 TCAV scores for a given layer, a list of concepts and input example are computed using the dot product between prediction's layer diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 4d9d8e104c..067bd85b9e 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -43,7 +43,7 @@ Implements abstract DataInfluence class and provides implementation details for influence computation based on the logic provided in TracIn paper -(https://arxiv.org/pdf/2002.08484.pdf). 
+(https://arxiv.org/abs/2002.08484). The TracIn paper proposes an idealized notion of influence which can be represented by the total amount a training example reduces loss for a test example via a training diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index 3bf6caad04..17232bf111 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -39,7 +39,7 @@ def _capture_inputs(layer: Module, input: Tensor, output: Tensor) -> None: r""" Implements abstract DataInfluence class and also provides implementation details for influence computation based on the logic provided in TracIn paper -(https://arxiv.org/pdf/2002.08484.pdf). +(https://arxiv.org/abs/2002.08484). The TracIn paper proposes an idealized notion of influence which can be represented by the total amount a training example reduces loss for a test example via a training @@ -722,7 +722,7 @@ def __init__( int, and random projection will be performed to ensure that the vector is of dimension no more than `projection_dim` * C. `projection_dim` corresponds to the variable d in the top of page - 15 of the TracIn paper: https://arxiv.org/pdf/2002.08484.pdf. + 15 of the TracIn paper: https://arxiv.org/abs/2002.08484. Default: None seed (int, optional): Because this implementation chooses a random projection, its output is random. Setting this seed specifies the @@ -1071,7 +1071,7 @@ def _set_projections_tracincp_fast_rand_proj( # allowable dimension of the "partial" intermediate quantity. Therefore, # we only project if `jacobian_dim` * `layer_input_dim` > `projection_dim`. # `projection_dim` corresponds to the variable d in the top of page 15 of - # the TracIn paper: https://arxiv.org/pdf/2002.08484.pdf. + # the TracIn paper: https://arxiv.org/abs/2002.08484. 
if jacobian_dim * layer_input_dim > projection_dim: jacobian_projection_dim = min(int(projection_dim**0.5), jacobian_dim) layer_input_projection_dim = min( @@ -1153,7 +1153,7 @@ def _get_intermediate_quantities_tracincp_fast_rand_proj( performed to ensure that the vector is of dimension no more than `self.projection_dim` * C. `self.projection_dim` corresponds to the variable d in the top of page 15 of the TracIn paper: - https://arxiv.org/pdf/2002.08484.pdf. + https://arxiv.org/abs/2002.08484. """ checkpoint_projections: List[Any] = [[] for _ in self.checkpoints] diff --git a/captum/metrics/_core/infidelity.py b/captum/metrics/_core/infidelity.py index ea1981101c..2536b301d3 100644 --- a/captum/metrics/_core/infidelity.py +++ b/captum/metrics/_core/infidelity.py @@ -126,7 +126,7 @@ def infidelity( and the differences between the predictor function at its input and perturbed input. More details about the measure can be found in the following paper: - https://arxiv.org/pdf/1901.09392.pdf + https://arxiv.org/abs/1901.09392 It is derived from the completeness property of well-known attribution algorithms and is a computationally more efficient and generalized @@ -134,7 +134,7 @@ def infidelity( of the attributions and the differences of the predictor function at its input and fixed baseline. More details about the Sensitivity-n can be found here: - https://arxiv.org/pdf/1711.06104.pdfs + https://arxiv.org/abs/1711.06104s The users can perturb the inputs any desired way by providing any perturbation function that takes the inputs (and optionally baselines) @@ -256,7 +256,7 @@ def infidelity( provided in the `captum.attr` package. 
Some of those attribution approaches are so called global methods, which means that they factor in model inputs' multiplier, as described in: - https://arxiv.org/pdf/1711.06104.pdf + https://arxiv.org/abs/1711.06104 Many global attribution algorithms can be used in local modes, meaning that the inputs multiplier isn't factored in the attribution scores. @@ -272,7 +272,7 @@ def infidelity( For local attributions we can use real-valued perturbations whereas for global attributions that perturbation is binary. - https://arxiv.org/pdf/1901.09392.pdf + https://arxiv.org/abs/1901.09392 If we want to compute the infidelity of global attributions we can use a binary perturbation matrix that will allow us to select diff --git a/captum/metrics/_core/sensitivity.py b/captum/metrics/_core/sensitivity.py index 7067149fdc..4630d02340 100644 --- a/captum/metrics/_core/sensitivity.py +++ b/captum/metrics/_core/sensitivity.py @@ -90,16 +90,16 @@ def sensitivity_max( More about the Lipschitz Continuity Metric can also be found here `On the Robustness of Interpretability Methods` - https://arxiv.org/pdf/1806.08049.pdf + https://arxiv.org/abs/1806.08049 and `Towards Robust Interpretability with Self-Explaining Neural Networks` https://papers.nips.cc/paper\ 8003-towards-robust-interpretability- - with-self-explaining-neural-networks.pdf + with-self-explaining-neural-networks More details about sensitivity max can be found here: `On the (In)fidelity and Sensitivity of Explanations` - https://arxiv.org/pdf/1901.09392.pdf + https://arxiv.org/abs/1901.09392 Args: diff --git a/captum/robust/_core/fgsm.py b/captum/robust/_core/fgsm.py index 5cbf6a0dae..f5de571bd9 100644 --- a/captum/robust/_core/fgsm.py +++ b/captum/robust/_core/fgsm.py @@ -31,7 +31,7 @@ class FGSM(Perturbation): More details on Fast Gradient Sign Method can be found in the original paper: - https://arxiv.org/pdf/1412.6572.pdf + https://arxiv.org/abs/1412.6572 """ def __init__( diff --git a/captum/robust/_core/pgd.py 
b/captum/robust/_core/pgd.py index 151edf370c..56dfa34038 100644 --- a/captum/robust/_core/pgd.py +++ b/captum/robust/_core/pgd.py @@ -32,7 +32,7 @@ class PGD(Perturbation): More details on Projected Gradient Descent can be found in the original paper: - https://arxiv.org/pdf/1706.06083.pdf + https://arxiv.org/abs/1706.06083 """ def __init__( From 7530b25a089a9752687e368c5c7db52893693345 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 10:59:47 -0600 Subject: [PATCH 26/84] http -> https --- captum/attr/_core/deep_lift.py | 2 +- captum/attr/_core/layer/layer_deep_lift.py | 2 +- captum/attr/_core/neuron/neuron_deep_lift.py | 2 +- captum/attr/_core/shapley_value.py | 6 +++--- captum/metrics/_core/sensitivity.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 96a89fba66..51dce4fb56 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -611,7 +611,7 @@ class DeepLiftShap(DeepLift): each baseline and averages resulting attributions. More details about the algorithm can be found here: - http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions + https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf Note that the explanation model: diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 08e27c41e5..4618ef73bd 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -382,7 +382,7 @@ class LayerDeepLiftShap(LayerDeepLift, DeepLiftShap): input flag `attribute_to_layer_input`. 
More details about the algorithm can be found here: - http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions + https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf Note that the explanation model: diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 48dd963c6a..3f4c33226b 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -273,7 +273,7 @@ class NeuronDeepLiftShap(NeuronAttribution, GradientAttribution): by the input flag `attribute_to_layer_input`. More details about the algorithm can be found here: - http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions + https://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf Note that the explanation model: 1. Assumes that input features are independent of one another diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index 95e1691cb5..9090480145 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -59,7 +59,7 @@ class ShapleyValueSampling(PerturbationAttribution): More details regarding Shapley Value sampling can be found in these papers: https://www.sciencedirect.com/science/article/pii/S0305054808000804 - https://pdfs.semanticscholar.org/7715/bb1070691455d1fcfc6346ff458dbca77b2c + https://pdfs.semanticscholar.org/7715/bb1070691455d1fcfc6346ff458dbca77b2c.pdf """ def __init__(self, forward_func: Callable) -> None: @@ -504,9 +504,9 @@ class ShapleyValues(ShapleyValueSampling): the entire feature group. 
More details regarding Shapley Values can be found in these papers: - https://apps.dtic.mil/dtic/tr/fulltext/u2/604084 + https://apps.dtic.mil/dtic/tr/fulltext/u2/604084.pdf https://www.sciencedirect.com/science/article/pii/S0305054808000804 - https://pdfs.semanticscholar.org/7715/bb1070691455d1fcfc6346ff458dbca77b2c + https://pdfs.semanticscholar.org/7715/bb1070691455d1fcfc6346ff458dbca77b2c.pdf NOTE: The method implemented here is very computationally intensive, and should only be used with a very small number of features (e.g. < 7). diff --git a/captum/metrics/_core/sensitivity.py b/captum/metrics/_core/sensitivity.py index 4630d02340..1cff46a3c1 100644 --- a/captum/metrics/_core/sensitivity.py +++ b/captum/metrics/_core/sensitivity.py @@ -95,7 +95,7 @@ def sensitivity_max( `Towards Robust Interpretability with Self-Explaining Neural Networks` https://papers.nips.cc/paper\ 8003-towards-robust-interpretability- - with-self-explaining-neural-networks + with-self-explaining-neural-networks.pdf More details about sensitivity max can be found here: `On the (In)fidelity and Sensitivity of Explanations` From dcf363a5e3bdd500dc82be575039547b3a9f72b4 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 11:06:34 -0600 Subject: [PATCH 27/84] Fix minor issues --- captum/metrics/_core/infidelity.py | 2 +- captum/robust/_core/metrics/attack_comparator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/captum/metrics/_core/infidelity.py b/captum/metrics/_core/infidelity.py index 2536b301d3..ecbbcf9bfd 100644 --- a/captum/metrics/_core/infidelity.py +++ b/captum/metrics/_core/infidelity.py @@ -134,7 +134,7 @@ def infidelity( of the attributions and the differences of the predictor function at its input and fixed baseline. 
More details about the Sensitivity-n can be found here: - https://arxiv.org/abs/1711.06104s + https://arxiv.org/abs/1711.06104 The users can perturb the inputs any desired way by providing any perturbation function that takes the inputs (and optionally baselines) diff --git a/captum/robust/_core/metrics/attack_comparator.py b/captum/robust/_core/metrics/attack_comparator.py index 314a91b6aa..890cc01720 100644 --- a/captum/robust/_core/metrics/attack_comparator.py +++ b/captum/robust/_core/metrics/attack_comparator.py @@ -78,7 +78,7 @@ def __init__( All kwargs provided to evaluate are provided to the metric function, following the model output. A single metric can be returned as a float or tensor, and multiple metrics should be returned as either - a tuple or named tuple of float or tensors. For a tensor metric, + a tuple or named tuple of floats or tensors. For a tensor metric, the first dimension should match the batch size, corresponding to metrics for each example. Tensor metrics are averaged over the first dimension when aggregating multiple batch results. 
From 31e453b32b4fbe86ccdc82729dac38f3f952c124 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 11:09:58 -0600 Subject: [PATCH 28/84] http -> https --- captum/attr/_core/layer/layer_gradient_shap.py | 2 +- captum/attr/_core/neuron/neuron_gradient_shap.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index a5835d0568..fe6128dc57 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -29,7 +29,7 @@ class LayerGradientShap(LayerAttribution, GradientAttribution): #deep-learning-example-with-gradientexplainer-tensorflowkeraspytorch-models A Unified Approach to Interpreting Model Predictions - http://papers.nips.cc/paper\ + https://papers.nips.cc/paper\ 7062-a-unified-approach-to-interpreting-model-predictions GradientShap approximates SHAP values by computing the expectations of diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index 0a965b08a6..d521fe94dd 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -18,7 +18,7 @@ class NeuronGradientShap(NeuronAttribution, GradientAttribution): #deep-learning-example-with-gradientexplainer-tensorflowkeraspytorch-models A Unified Approach to Interpreting Model Predictions - http://papers.nips.cc/paper\ + https://papers.nips.cc/paper\ 7062-a-unified-approach-to-interpreting-model-predictions GradientShap approximates SHAP values by computing the expectations of From 0a091d8d8b68dccf3c19603ee2289f1681931b19 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 13:21:24 -0600 Subject: [PATCH 29/84] Capitalize Any & Callable in docstrings * Capitalize Any & Callable in docstrings * More doc fixes * perturbation -> Perturbation --- captum/_utils/models/linear_model/model.py | 2 +- 
captum/attr/_core/deep_lift.py | 10 ++++---- captum/attr/_core/feature_ablation.py | 6 ++--- captum/attr/_core/feature_permutation.py | 8 +++---- captum/attr/_core/gradient_shap.py | 8 +++---- .../attr/_core/guided_backprop_deconvnet.py | 4 ++-- captum/attr/_core/guided_grad_cam.py | 2 +- captum/attr/_core/input_x_gradient.py | 4 ++-- captum/attr/_core/integrated_gradients.py | 4 ++-- captum/attr/_core/kernel_shap.py | 4 ++-- captum/attr/_core/layer/grad_cam.py | 4 ++-- captum/attr/_core/layer/internal_influence.py | 4 ++-- captum/attr/_core/layer/layer_activation.py | 4 ++-- captum/attr/_core/layer/layer_conductance.py | 4 ++-- captum/attr/_core/layer/layer_deep_lift.py | 10 ++++---- .../_core/layer/layer_feature_ablation.py | 4 ++-- .../attr/_core/layer/layer_gradient_shap.py | 8 +++---- .../layer/layer_gradient_x_activation.py | 4 ++-- .../_core/layer/layer_integrated_gradients.py | 4 ++-- captum/attr/_core/lime.py | 24 +++++++++---------- .../attr/_core/neuron/neuron_conductance.py | 6 ++--- captum/attr/_core/neuron/neuron_deep_lift.py | 14 +++++------ .../_core/neuron/neuron_feature_ablation.py | 6 ++--- captum/attr/_core/neuron/neuron_gradient.py | 6 ++--- .../attr/_core/neuron/neuron_gradient_shap.py | 8 +++---- .../neuron_guided_backprop_deconvnet.py | 8 +++---- .../neuron/neuron_integrated_gradients.py | 6 ++--- captum/attr/_core/noise_tunnel.py | 2 +- captum/attr/_core/occlusion.py | 4 ++-- captum/attr/_core/saliency.py | 4 ++-- captum/attr/_core/shapley_value.py | 8 +++---- captum/attr/_models/base.py | 8 +++---- captum/attr/_utils/attribution.py | 14 +++++------ captum/attr/_utils/stat.py | 2 +- captum/attr/_utils/visualization.py | 2 +- captum/concept/_core/tcav.py | 6 ++--- captum/concept/_utils/data_iterator.py | 2 +- captum/influence/_core/influence.py | 4 ++-- .../influence/_core/similarity_influence.py | 2 +- captum/influence/_core/tracincp.py | 12 +++++----- .../_core/tracincp_fast_rand_proj.py | 12 +++++----- captum/influence/_utils/common.py | 
2 +- captum/insights/attr_vis/app.py | 2 +- captum/insights/attr_vis/features.py | 18 +++++++------- captum/metrics/_core/infidelity.py | 10 ++++---- captum/metrics/_core/sensitivity.py | 6 ++--- captum/metrics/_utils/batching.py | 4 ++-- captum/robust/_core/fgsm.py | 10 ++++---- .../robust/_core/metrics/attack_comparator.py | 16 ++++++------- .../_core/metrics/min_param_perturbation.py | 14 +++++------ captum/robust/_core/pgd.py | 10 ++++---- 51 files changed, 175 insertions(+), 175 deletions(-) diff --git a/captum/_utils/models/linear_model/model.py b/captum/_utils/models/linear_model/model.py index bfffdbf38a..6b8623a560 100644 --- a/captum/_utils/models/linear_model/model.py +++ b/captum/_utils/models/linear_model/model.py @@ -20,7 +20,7 @@ def __init__(self, train_fn: Callable, **kwargs) -> None: Please note that this is an experimental feature. Args: - train_fn (callable) + train_fn (Callable) The function to train with. See `captum._utils.models.linear_model.train.sgd_train_linear_model` and diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 51dce4fb56..47b220358e 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -251,7 +251,7 @@ def attribute( # type: ignore target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -267,7 +267,7 @@ def attribute( # type: ignore is set to True convergence delta will be returned in a tuple following attributions. Default: False - custom_attribution_func (callable, optional): A custom function for + custom_attribution_func (Callable, optional): A custom function for computing final attribution scores. 
This function can take at least one and at most three arguments with the following signature: @@ -705,7 +705,7 @@ def attribute( # type: ignore to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, callable): + baselines (tensor, tuple of tensors, Callable): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -755,7 +755,7 @@ def attribute( # type: ignore target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -771,7 +771,7 @@ def attribute( # type: ignore is set to True convergence delta will be returned in a tuple following attributions. Default: False - custom_attribution_func (callable, optional): A custom function for + custom_attribution_func (Callable, optional): A custom function for computing final attribution scores. This function can take at least one and at most three arguments with the following signature: diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index 34e983bb01..11dd7bf89b 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -47,7 +47,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it """ PerturbationAttribution.__init__(self, forward_func) @@ -130,7 +130,7 @@ def attribute( target for the corresponding example. 
Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -187,7 +187,7 @@ def attribute( (e.g. time estimation). Otherwise, it will fallback to a simple output of progress. Default: False - **kwargs (any, optional): Any additional arguments used by child + **kwargs (Any, optional): Any additional arguments used by child classes of FeatureAblation (such as Occlusion) to construct ablations. These arguments are ignored when using FeatureAblation directly. diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index abad41aff2..78f1e3cf80 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -75,9 +75,9 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it - perm_func (callable, optional): A function that accepts a batch of + perm_func (Callable, optional): A function that accepts a batch of inputs and a feature mask, and "permutes" the feature using feature mask across the batch. This defaults to a function which applies a random permutation, this argument only needs @@ -143,7 +143,7 @@ def attribute( # type: ignore target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -195,7 +195,7 @@ def attribute( # type: ignore (e.g. time estimation). Otherwise, it will fallback to a simple output of progress. 
Default: False - **kwargs (any, optional): Any additional arguments used by child + **kwargs (Any, optional): Any additional arguments used by child classes of FeatureAblation (such as Occlusion) to construct ablations. These arguments are ignored when using FeatureAblation directly. diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 55a7c0c194..47bb7b3955 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -59,7 +59,7 @@ def __init__(self, forward_func: Callable, multiply_by_inputs: bool = True) -> N r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it. multiply_by_inputs (bool, optional): Indicates whether to factor model inputs' multiplier in the final attribution scores. @@ -135,7 +135,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, callable): + baselines (tensor, tuple of tensors, Callable): Baselines define the starting point from which expectation is computed and can be provided as: @@ -196,7 +196,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It can contain a tuple of ND tensors or @@ -294,7 +294,7 @@ def __init__(self, forward_func: Callable, multiply_by_inputs=True) -> None: r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it multiply_by_inputs (bool, optional): Indicates whether to factor model inputs' multiplier in the final attribution scores. diff --git a/captum/attr/_core/guided_backprop_deconvnet.py b/captum/attr/_core/guided_backprop_deconvnet.py index 89b65ef1e2..8c89846a1a 100644 --- a/captum/attr/_core/guided_backprop_deconvnet.py +++ b/captum/attr/_core/guided_backprop_deconvnet.py @@ -173,7 +173,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -284,7 +284,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py index 5f33e08041..6fef60bc0f 100644 --- a/captum/attr/_core/guided_grad_cam.py +++ b/captum/attr/_core/guided_grad_cam.py @@ -113,7 +113,7 @@ def attribute( target for the corresponding example. 
Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/input_x_gradient.py b/captum/attr/_core/input_x_gradient.py index facfefa73e..a69beccef5 100644 --- a/captum/attr/_core/input_x_gradient.py +++ b/captum/attr/_core/input_x_gradient.py @@ -22,7 +22,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it """ GradientAttribution.__init__(self, forward_func) @@ -71,7 +71,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index a7933e95b5..e800bb1b3c 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -53,7 +53,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it multiply_by_inputs (bool, optional): Indicates whether to factor model inputs' multiplier in the final attribution scores. @@ -192,7 +192,7 @@ def attribute( # type: ignore target for the corresponding example. 
Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/kernel_shap.py b/captum/attr/_core/kernel_shap.py index 85ff2f7f56..a1151e6f53 100644 --- a/captum/attr/_core/kernel_shap.py +++ b/captum/attr/_core/kernel_shap.py @@ -29,7 +29,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it """ Lime.__init__( @@ -150,7 +150,7 @@ def attribute( # type: ignore target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py index b9f8496da9..cac8179a69 100644 --- a/captum/attr/_core/layer/grad_cam.py +++ b/captum/attr/_core/layer/grad_cam.py @@ -59,7 +59,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's output @@ -119,7 +119,7 @@ def attribute( target for the corresponding example. 
Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index 668385c3ff..fac9ce1cfd 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -46,7 +46,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -140,7 +140,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py index 8d2ff2c7f4..c3f565d6c7 100644 --- a/captum/attr/_core/layer/layer_activation.py +++ b/captum/attr/_core/layer/layer_activation.py @@ -25,7 +25,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module or list of torch.nn.Module): Layer or layers for which attributions are computed. 
@@ -62,7 +62,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index 988c2339b6..bbeeed5795 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -49,7 +49,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -182,7 +182,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 4618ef73bd..42a9e359c0 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -210,7 +210,7 @@ def attribute( target for the corresponding example. 
Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -237,7 +237,7 @@ def attribute( attribute to the input or output, is a single tensor. Support for multiple tensors will be added later. Default: False - custom_attribution_func (callable, optional): A custom function for + custom_attribution_func (Callable, optional): A custom function for computing final attribution scores. This function can take at least one and at most three arguments with the following signature: @@ -491,7 +491,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, callable): + baselines (tensor, tuple of tensors, Callable): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -541,7 +541,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -567,7 +567,7 @@ def attribute( outputs of internal layers are single tensors. Support for multiple tensors will be added later. Default: False - custom_attribution_func (callable, optional): A custom function for + custom_attribution_func (Callable, optional): A custom function for computing final attribution scores. 
This function can take at least one and at most three arguments with the following signature: diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py index 3d7a071379..fd083a4876 100644 --- a/captum/attr/_core/layer/layer_feature_ablation.py +++ b/captum/attr/_core/layer/layer_feature_ablation.py @@ -42,7 +42,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -119,7 +119,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index fe6128dc57..54bbf6613c 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -67,7 +67,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -155,7 +155,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. 
- baselines (tensor, tuple of tensors, callable): + baselines (tensor, tuple of tensors, Callable): Baselines define the starting point from which expectation is computed and can be provided as: @@ -216,7 +216,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It can contain a tuple of ND tensors or @@ -335,7 +335,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index 66f432fd64..dbf0d15992 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -30,7 +30,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module or list of torch.nn.Module): Layer or layers for which attributions are computed. @@ -113,7 +113,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It must be either a single additional diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 04d1572504..12a52cb16a 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -53,7 +53,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (ModuleOrModuleList): Layer or list of layers for which attributions are computed. For each layer the output size of the attribute @@ -251,7 +251,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index d868479996..77150a3f94 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -82,7 +82,7 @@ def __init__( Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it. If a batch is provided as input for attribution, it is expected that forward_func returns a scalar representing the entire batch. @@ -106,7 +106,7 @@ def __init__( Note that calling fit multiple times should retrain the interpretable model, each attribution call reuses the same given interpretable model object. - similarity_func (callable): Function which takes a single sample + similarity_func (Callable): Function which takes a single sample along with its corresponding interpretable representation and returns the weight of the interpretable sample for training interpretable model. 
Weight is generally @@ -131,7 +131,7 @@ def __init__( All kwargs passed to the attribute method are provided as keyword arguments (kwargs) to this callable. - perturb_func (callable): Function which returns a single + perturb_func (Callable): Function which returns a single sampled input, generally a perturbation of the original input, which is used to train the interpretable surrogate model. Function can return samples in either @@ -171,7 +171,7 @@ def __init__( input. Once sampled, inputs can be converted to / from the interpretable representation with either to_interp_rep_transform or from_interp_rep_transform. - from_interp_rep_transform (callable): Function which takes a + from_interp_rep_transform (Callable): Function which takes a single sampled interpretable representation (tensor of shape 1 x num_interp_features) and returns the corresponding representation in the input space @@ -194,7 +194,7 @@ def __init__( All kwargs passed to the attribute method are provided as keyword arguments (kwargs) to this callable. - to_interp_rep_transform (callable): Function which takes a + to_interp_rep_transform (Callable): Function which takes a sample in the original input space and converts to its interpretable representation (tensor of shape 1 x num_interp_features). @@ -300,7 +300,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -335,7 +335,7 @@ def attribute( (e.g. time estimation). Otherwise, it will fallback to a simple output of progress. 
Default: False - **kwargs (any, optional): Any additional arguments necessary for + **kwargs (Any, optional): Any additional arguments necessary for sampling and transformation functions (provided to constructor). Default: None @@ -732,9 +732,9 @@ def __init__( Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it - interpretable_model (optional, Model): Model object to train + interpretable_model (Model, optional): Model object to train interpretable model. This argument is optional and defaults to SkLearnLasso(alpha=0.01), @@ -760,7 +760,7 @@ def __init__( Note that calling fit multiple times should retrain the interpretable model, each attribution call reuses the same given interpretable model object. - similarity_func (optional, callable): Function which takes a single sample + similarity_func (Callable, optional): Function which takes a single sample along with its corresponding interpretable representation and returns the weight of the interpretable sample for training the interpretable model. @@ -793,7 +793,7 @@ def __init__( kwargs includes baselines, feature_mask, num_interp_features (integer, determined from feature mask). - perturb_func (optional, callable): Function which returns a single + perturb_func (Callable, optional): Function which returns a single sampled input, which is a binary vector of length num_interp_features, or a generator of such tensors. @@ -943,7 +943,7 @@ def attribute( # type: ignore target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It must be either a single additional diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index 0a3b43248e..66e59dc46c 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -45,7 +45,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which neuron attributions are computed. Attributions for a particular neuron in the input or output @@ -111,7 +111,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -197,7 +197,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 3f4c33226b..43a1b96a27 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -99,7 +99,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. 
- neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -165,7 +165,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -187,7 +187,7 @@ def attribute( attribute to the input or output, is a single tensor. Support for multiple tensors will be added later. Default: False - custom_attribution_func (callable, optional): A custom function for + custom_attribution_func (Callable, optional): A custom function for computing final attribution scores. This function can take at least one and at most three arguments with the following signature: @@ -344,7 +344,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -378,7 +378,7 @@ def attribute( or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (tensor, tuple of tensors, callable): + baselines (tensor, tuple of tensors, Callable): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -403,7 +403,7 @@ def attribute( It is recommended that the number of samples in the baselines' tensors is larger than one. 
- additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -425,7 +425,7 @@ def attribute( attribute to the input or output, is a single tensor. Support for multiple tensors will be added later. Default: False - custom_attribution_func (callable, optional): A custom function for + custom_attribution_func (Callable, optional): A custom function for computing final attribution scores. This function can take at least one and at most three arguments with the following signature: diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index f53afafacd..61edd9f418 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -35,7 +35,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Attributions for a particular neuron in the input or output @@ -75,7 +75,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -137,7 +137,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. 
Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 93ad3ef52a..480bc4f6e1 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -33,7 +33,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -72,7 +72,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -105,7 +105,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It must be either a single additional diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index d521fe94dd..ccf5524472 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -56,7 +56,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which neuron attributions are computed. The output size of the attribute method matches the @@ -114,7 +114,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -147,7 +147,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (tensor, tuple of tensors, callable): + baselines (tensor, tuple of tensors, Callable): Baselines define the starting point from which expectation is computed and can be provided as: @@ -183,7 +183,7 @@ def attribute( corresponds to the input with the same index in the inputs tuple. Default: 0.0 - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It can contain a tuple of ND tensors or diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index ec2ca43146..1a96ce7497 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -78,7 +78,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -111,7 +111,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional @@ -247,7 +247,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: @@ -280,7 +280,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index de2829aa89..0c29f1943b 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -33,7 +33,7 @@ def __init__( r""" Args: - forward_func (callable): The forward function of the model or any + forward_func (Callable): The forward function of the model or any modification of it layer (torch.nn.Module): Layer for which attributions are computed. Output size of attribute matches this layer's input or @@ -92,7 +92,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, callable, or tuple of int or slice): + neuron_selector (int, Callable, or tuple of int or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -155,7 +155,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It must be either a single additional diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 73cdfdfe4e..b4dc561256 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -129,7 +129,7 @@ def attribute( randomly draw baseline samples from the `baselines` distribution provided as an input tensor. Default: False - **kwargs (any, optional): Contains a list of arguments that are passed + **kwargs (Any, optional): Contains a list of arguments that are passed to `attribution_method` attribution algorithm. Any additional arguments that should be used for the chosen attribution method should be included here. diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index 0f953e98b7..7db0157c8c 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -39,7 +39,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it """ FeatureAblation.__init__(self, forward_func) @@ -153,7 +153,7 @@ def attribute( # type: ignore target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It must be either a single additional diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py index 44f747b831..8b784e7a9c 100644 --- a/captum/attr/_core/saliency.py +++ b/captum/attr/_core/saliency.py @@ -27,7 +27,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it """ GradientAttribution.__init__(self, forward_func) @@ -81,7 +81,7 @@ def attribute( to True, otherwise returns the (signed) gradients if False. Default: True - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index 9090480145..a0d980a54b 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -66,7 +66,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it. The forward function can either return a scalar per example, or a single scalar for the full batch. If a single scalar is returned for the batch, @@ -160,7 +160,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It must be either a single additional @@ -520,7 +520,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable): The forward function of the model or + forward_func (Callable): The forward function of the model or any modification of it. The forward function can either return a scalar per example, or a single scalar for the full batch. If a single scalar is returned for the batch, @@ -613,7 +613,7 @@ def attribute( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/attr/_models/base.py b/captum/attr/_models/base.py index 2c24918567..25ed8469fb 100644 --- a/captum/attr/_models/base.py +++ b/captum/attr/_models/base.py @@ -35,7 +35,7 @@ def forward(self, *inputs, **kwargs): Args: - *inputs (any, optional): A sequence of inputs arguments that the + *inputs (Any, optional): A sequence of inputs arguments that the forward function takes. Since forward functions can take any type and number of arguments, this will ensure that we can execute the forward pass using interpretable embedding layer. @@ -43,7 +43,7 @@ def forward(self, *inputs, **kwargs): argument is the embedding tensor generated using the `self.embedding` layer using all input arguments provided in `inputs` and `kwargs`. - **kwargs (any, optional): Similar to `inputs` we want to make sure + **kwargs (Any, optional): Similar to `inputs` we want to make sure that our forward pass supports arbitrary number and type of key-value arguments. 
If `inputs` is not provided, `kwargs` must be provided and the first argument corresponds to the embedding @@ -76,10 +76,10 @@ def indices_to_embeddings(self, *input, **kwargs): Args: - *input (any, Optional): This can be a tensor(s) of input indices or any + *input (Any, optional): This can be a tensor(s) of input indices or any other variable necessary to comput the embeddings. A typical example of input indices are word or token indices. - **kwargs (any, optional): Similar to `input` this can be any sequence + **kwargs (Any, optional): Similar to `input` this can be any sequence of key-value arguments necessary to compute final embedding tensor. Returns: diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index 1cfbf7bb72..2c66481870 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -31,7 +31,7 @@ class Attribution: def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (Callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. """ @@ -150,7 +150,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (Callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. """ @@ -228,7 +228,7 @@ def compute_convergence_delta( target for the corresponding example. Default: None - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. 
It must be either a single additional @@ -306,7 +306,7 @@ def __init__(self, forward_func: Callable) -> None: r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (Callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. """ @@ -333,7 +333,7 @@ def __init__( r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (Callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. layer (torch.nn.Module): Layer for which output attributions are computed. @@ -366,7 +366,7 @@ def __init__( r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (Callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. layer (torch.nn.Module): Layer for which output attributions are computed. @@ -436,7 +436,7 @@ def __init__( r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (Callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of model's forward function. layer (torch.nn.Module): Layer for which output attributions are computed. diff --git a/captum/attr/_utils/stat.py b/captum/attr/_utils/stat.py index 8c643f369b..803bbc7ab7 100644 --- a/captum/attr/_utils/stat.py +++ b/captum/attr/_utils/stat.py @@ -26,7 +26,7 @@ def __init__(self, name: Optional[str] = None, **kwargs: Any) -> None: name (str, optional): The name of the statistic. 
If not provided, the class name will be used alongside it's parameters - kwargs (any): + kwargs (Any): Additional arguments used to construct the statistic """ self.params = kwargs diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index a4536fc759..736244ba45 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -376,7 +376,7 @@ def visualize_image_attr_multiple( uses Matplotlib object oriented API and simply returns a figure object without showing. Default: True. - **kwargs (any, optional): Any additional arguments which will be passed + **kwargs (Any, optional): Any additional arguments which will be passed to every individual visualization. Such arguments include `show_colorbar`, `alpha_overlay`, `cmap`, etc. diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index 66d2ef4b96..245aa5816a 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -280,7 +280,7 @@ def __init__( attribution algorithm. save_path (str, optional): The path for storing CAVs and Activation Vectors (AVs). - classifier_kwargs (any, optional): Additional arguments such as + classifier_kwargs (Any, optional): Additional arguments such as `test_split_ratio` that are passed to concept `classifier`. Examples:: @@ -612,7 +612,7 @@ def interpret( #output_dims - 1 elements. Each tuple is applied as the target for the corresponding example. - additional_forward_args (any, optional): Extra arguments that are passed to + additional_forward_args (Any, optional): Extra arguments that are passed to model when computing the attributions for `inputs` w.r.t. layer output. Default: None @@ -622,7 +622,7 @@ def interpret( `processes`. Otherwise, CAV computations will be performed sequential. Default:None - **kwargs (any, optional): A list of arguments that are passed to layer + **kwargs (Any, optional): A list of arguments that are passed to layer attribution algorithm's attribute method. 
This could be for example `n_steps` in case of integrated gradients. Default: None diff --git a/captum/concept/_utils/data_iterator.py b/captum/concept/_utils/data_iterator.py index 6a8a48f197..574bc71ae7 100644 --- a/captum/concept/_utils/data_iterator.py +++ b/captum/concept/_utils/data_iterator.py @@ -16,7 +16,7 @@ class CustomIterableDataset(IterableDataset): def __init__(self, transform_filename_to_tensor: Callable, path: str) -> None: r""" Args: - transform_filename_to_tensor (callable): Function to read a data + transform_filename_to_tensor (Callable): Function to read a data file from path and return a tensor from that file. path (str): Path to dataset files. This can be either a path to a directory or a file where input examples are stored. diff --git a/captum/influence/_core/influence.py b/captum/influence/_core/influence.py index b8e5eae357..f8ef1eb882 100644 --- a/captum/influence/_core/influence.py +++ b/captum/influence/_core/influence.py @@ -32,7 +32,7 @@ def __init_( def influence(self, inputs: Any = None, **kwargs: Any) -> Any: r""" Args: - inputs (any): Batch of examples for which influential + inputs (Any): Batch of examples for which influential instances are computed. They are passed to the forward_func. If `inputs` if a tensor or tuple of tensors, the first dimension of a tensor corresponds to the batch dimension. @@ -40,7 +40,7 @@ def influence(self, inputs: Any = None, **kwargs: Any) -> Any: implementation of `DataInfluence` abstract class. Returns: - influences (any): We do not add restrictions on the return type for now, + influences (Any): We do not add restrictions on the return type for now, though this may change in the future. 
""" pass diff --git a/captum/influence/_core/similarity_influence.py b/captum/influence/_core/similarity_influence.py index f3acce6a0d..6123005a68 100644 --- a/captum/influence/_core/similarity_influence.py +++ b/captum/influence/_core/similarity_influence.py @@ -94,7 +94,7 @@ def __init__( model_id (str): The name/version of the model for which layer activations are being computed. Activations will be stored and loaded under the subdirectory with this name if provided. - similarity_metric (callable): This is a callable function that computes a + similarity_metric (Callable): This is a callable function that computes a similarity metric between two representations. For example, the representations pair could be from the training and test sets. diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 067bd85b9e..37c802e9ec 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -128,7 +128,7 @@ def __init__( path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. - checkpoints_load_func (callable, optional): The function to load a saved + checkpoints_load_func (Callable, optional): The function to load a saved checkpoint into a model to update its parameters, and get the learning rate if it is saved. By default uses a utility to load a model saved as a state dict. @@ -138,7 +138,7 @@ def __init__( be computed for all layers. Otherwise, they will only be computed for the layers specified in `layers`. Default: None - loss_fn (callable, optional): The loss function applied to model. + loss_fn (Callable, optional): The loss function applied to model. Default: None batch_size (int or None, optional): Batch size of the DataLoader created to iterate through `influence_src_dataset`, if it is a Dataset. 
@@ -329,7 +329,7 @@ def influence( # type: ignore[override] Args: - inputs (any, optional): If not provided or `None`, the self influence mode + inputs (Any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential mode. If the argument `unpack_inputs` is False, the @@ -483,7 +483,7 @@ def __init__( path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. - checkpoints_load_func (callable, optional): The function to load a saved + checkpoints_load_func (Callable, optional): The function to load a saved checkpoint into a model to update its parameters, and get the learning rate if it is saved. By default uses a utility to load a model saved as a state dict. @@ -493,7 +493,7 @@ def __init__( be computed for all layers. Otherwise, they will only be computed for the layers specified in `layers`. Default: None - loss_fn (callable, optional): The loss function applied to model. There + loss_fn (Callable, optional): The loss function applied to model. There are two options for the return type of `loss_fn`. First, `loss_fn` can be a "per-example" loss function - returns a 1D Tensor of losses for each example in a batch. `nn.BCELoss(reduction="none")` @@ -651,7 +651,7 @@ def influence( # type: ignore[override] Args: - inputs (any, optional): If not provided or `None`, the self influence mode + inputs (Any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential mode. 
If the argument `unpack_inputs` is False, the diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index 17232bf111..f675ea15f1 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -116,12 +116,12 @@ def __init__( path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. - checkpoints_load_func (callable, optional): The function to load a saved + checkpoints_load_func (Callable, optional): The function to load a saved checkpoint into a model to update its parameters, and get the learning rate if it is saved. By default uses a utility to load a model saved as a state dict. Default: _load_flexible_state_dict - loss_fn (callable, optional): The loss function applied to model. `loss_fn` + loss_fn (Callable, optional): The loss function applied to model. `loss_fn` must be a "reduction" loss function that reduces the per-example losses in a batch, and returns a single scalar Tensor. Furthermore, the reduction must be the *sum* or the *mean* of the per-example @@ -226,7 +226,7 @@ def influence( # type: ignore[override] Args: - inputs (any, optional): If not provided or `None`, the self influence mode + inputs (Any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential mode. If the argument `unpack_inputs` is False, the @@ -675,12 +675,12 @@ def __init__( path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. 
- checkpoints_load_func (callable, optional): The function to load a saved + checkpoints_load_func (Callable, optional): The function to load a saved checkpoint into a model to update its parameters, and get the learning rate if it is saved. By default uses a utility to load a model saved as a state dict. Default: _load_flexible_state_dict - loss_fn (callable, optional): The loss function applied to model. `loss_fn` + loss_fn (Callable, optional): The loss function applied to model. `loss_fn` must be a "reduction" loss function that reduces the per-example losses in a batch, and returns a single scalar Tensor. Furthermore, the reduction must be the *sum* of the per-example losses. For @@ -932,7 +932,7 @@ def influence( # type: ignore[override] Args: - inputs (any, optional): If not provided or `None`, the self influence mode + inputs (Any, optional): If not provided or `None`, the self influence mode will be run. Otherwise, `inputs` is the test batch that will be used when running in either influence score or k-most influential mode. If the argument `unpack_inputs` is False, the diff --git a/captum/influence/_utils/common.py b/captum/influence/_utils/common.py index 3da00c9e48..a186f8e4de 100644 --- a/captum/influence/_utils/common.py +++ b/captum/influence/_utils/common.py @@ -206,7 +206,7 @@ def _get_k_most_influential_helper( Args: influence_src_dataloader (DataLoader): The DataLoader, representing training data, for which we want to compute proponents / opponents. - influence_batch_fn (callable): A callable that will be called via + influence_batch_fn (Callable): A callable that will be called via `influence_batch_fn(inputs, targets, batch)`, where `batch` is a batch in the `influence_src_dataloader` argument. inputs (tuple of Any): A batch of examples. 
Does not represent labels, diff --git a/captum/insights/attr_vis/app.py b/captum/insights/attr_vis/app.py index 04c30da245..10cc0a2048 100644 --- a/captum/insights/attr_vis/app.py +++ b/captum/insights/attr_vis/app.py @@ -166,7 +166,7 @@ def __init__( dataset (iterable of Batch): Defines the dataset to visualize attributions for. This must be an iterable of batch objects, each of which may contain multiple input examples. - score_func (callable, optional): This function is applied to the model + score_func (Callable, optional): This function is applied to the model output to obtain the score for each class. For instance, this function could be the softmax or final non-linearity of the network, applied to the model output. The indices diff --git a/captum/insights/attr_vis/features.py b/captum/insights/attr_vis/features.py index 5b1d431f61..bd95bc47e9 100644 --- a/captum/insights/attr_vis/features.py +++ b/captum/insights/attr_vis/features.py @@ -43,16 +43,16 @@ def __init__( name (str): The label of the specific feature. For example, an ImageFeature's name can be "Photo". - baseline_transforms (list, callable, optional): Optional list of + baseline_transforms (list, Callable, optional): Optional list of callables (e.g. functions) to be called on the input tensor to construct multiple baselines. Currently only one baseline is supported. See :py:class:`.IntegratedGradients` for more information about baselines. - input_transforms (list, callable, optional): Optional list of callables + input_transforms (list, Callable, optional): Optional list of callables (e.g. functions) called on the input tensor sequentially to convert it into the format expected by the model. - visualization_transform (callable, optional): Optional callable (e.g. + visualization_transform (Callable, optional): Optional callable (e.g. 
function) applied as a postprocessing step of the original input data (before ``input_transforms``) to convert it to a format to be understood by the frontend visualizer as @@ -89,16 +89,16 @@ def __init__( Args: name (str): The label of the specific feature. For example, an ImageFeature's name can be "Photo". - baseline_transforms (list, callable, optional): Optional list of + baseline_transforms (list, Callable, optional): Optional list of callables (e.g. functions) to be called on the input tensor to construct multiple baselines. Currently only one baseline is supported. See :py:class:`.IntegratedGradients` for more information about baselines. - input_transforms (list, callable, optional): A list of transforms + input_transforms (list, Callable, optional): A list of transforms or transform to be applied to the input. For images, normalization is often applied here. - visualization_transform (callable, optional): Optional callable (e.g. + visualization_transform (Callable, optional): Optional callable (e.g. function) applied as a postprocessing step of the original input data (before input_transforms) to convert it to a format to be visualized. @@ -164,7 +164,7 @@ def __init__( Args: name (str): The label of the specific feature. For example, an ImageFeature's name can be "Photo". - baseline_transforms (list, callable, optional): Optional list of + baseline_transforms (list, Callable, optional): Optional list of callables (e.g. functions) to be called on the input tensor to construct multiple baselines. Currently only one baseline is supported. See @@ -174,7 +174,7 @@ def __init__( corresponding to PAD with the same size as the input tensor. See :py:class:`.TokenReferenceBase` for more information. - input_transforms (list, callable, optional): A list of transforms + input_transforms (list, Callable, optional): A list of transforms or transform to be applied to the input. 
For text, a common transform is to convert the tokenized input tensor into an interpretable embedding. See @@ -182,7 +182,7 @@ def __init__( and :py:func:`~.configure_interpretable_embedding_layer` for more information. - visualization_transform (callable, optional): Optional callable (e.g. + visualization_transform (Callable, optional): Optional callable (e.g. function) applied as a postprocessing step of the original input data (before ``input_transforms``) to convert it to a suitable format for visualization. For text features, diff --git a/captum/metrics/_core/infidelity.py b/captum/metrics/_core/infidelity.py index ecbbcf9bfd..e8e1599f1e 100644 --- a/captum/metrics/_core/infidelity.py +++ b/captum/metrics/_core/infidelity.py @@ -44,12 +44,12 @@ def sub_infidelity_perturb_func_decorator(pertub_func: Callable) -> Callable: r""" Args: - pertub_func(callable): Input perturbation function that takes inputs + pertub_func(Callable): Input perturbation function that takes inputs and optionally baselines and returns perturbed inputs Returns: - default_perturb_func(callable): Internal default perturbation + default_perturb_func(Callable): Internal default perturbation function that computes the perturbations internally and returns perturbations and perturbed inputs. @@ -147,10 +147,10 @@ def infidelity( Args: - forward_func (callable): + forward_func (Callable): The forward function of the model or any modification of it. - perturb_func (callable): + perturb_func (Callable): The perturbation function of model inputs. This function takes model inputs and optionally baselines as input arguments and returns either a tuple of perturbations and perturbed inputs or just @@ -292,7 +292,7 @@ def infidelity( tensor as well. If inputs is provided as a tuple of tensors then attributions will be tuples of tensors as well. 
- additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. It must be either a single additional diff --git a/captum/metrics/_core/sensitivity.py b/captum/metrics/_core/sensitivity.py index 1cff46a3c1..1f8d987a67 100644 --- a/captum/metrics/_core/sensitivity.py +++ b/captum/metrics/_core/sensitivity.py @@ -103,7 +103,7 @@ def sensitivity_max( Args: - explanation_func (callable): + explanation_func (Callable): This function can be the `attribute` method of an attribution algorithm or any other explanation method that returns the explanations. @@ -119,7 +119,7 @@ def sensitivity_max( multiple input tensors are provided, the examples must be aligned appropriately. - perturb_func (callable): + perturb_func (Callable): The perturbation function of model inputs. This function takes model inputs and optionally `perturb_radius` if the function takes more than one argument and returns @@ -166,7 +166,7 @@ def sensitivity_max( `input batch size * n_perturb_samples`. Default: None - **kwargs (any, optional): Contains a list of arguments that are passed + **kwargs (Any, optional): Contains a list of arguments that are passed to `explanation_func` explanation function which in some cases could be the `attribute` function of an attribution algorithm. Any additional arguments that need be passed to the explanation diff --git a/captum/metrics/_utils/batching.py b/captum/metrics/_utils/batching.py index ee3b38f58e..c906307c09 100644 --- a/captum/metrics/_utils/batching.py +++ b/captum/metrics/_utils/batching.py @@ -28,9 +28,9 @@ def _divide_and_aggregate_metrics( attributions for. n_perturb_samples (int): The number of samples per example that are used for perturbation purposes for example. 
- metric_func (callable): This function takes the number of samples per + metric_func (Callable): This function takes the number of samples per input batch and returns an overall metric for each example. - agg_func (callable, optional): This function is used to aggregate the + agg_func (Callable, optional): This function is used to aggregate the metrics across multiple sub-batches and that are generated by `metric_func`. max_examples_per_batch (int, optional): The maximum number of allowed examples diff --git a/captum/robust/_core/fgsm.py b/captum/robust/_core/fgsm.py index f5de571bd9..153b02ae47 100644 --- a/captum/robust/_core/fgsm.py +++ b/captum/robust/_core/fgsm.py @@ -43,9 +43,9 @@ def __init__( ) -> None: r""" Args: - forward_func (callable): The pytorch model for which the attack is + forward_func (Callable): The pytorch model for which the attack is computed. - loss_func (callable, optional): Loss function of which the gradient + loss_func (Callable, optional): Loss function of which the gradient computed. The loss function should take in outputs of the model and labels, and return a loss tensor. The default loss function is negative log. @@ -54,7 +54,7 @@ def __init__( e.g. image pixels must be in the range 0-255 Attributes: - bound (callable): A function that bounds the input values based on + bound (Callable): A function that bounds the input values based on given lower_bound and upper_bound. Can be overwritten for custom use cases if necessary. zero_thresh (float): The threshold below which gradient will be treated @@ -86,7 +86,7 @@ def perturb( input tensors are provided, the batch sizes must be aligned accross all tensors. epsilon (float): Step size of perturbation. - target (any): True labels of inputs if non-targeted attack is + target (Any): True labels of inputs if non-targeted attack is desired. Target class of inputs if targeted attack is desired. 
Target will be passed to the loss function to compute loss, so the type needs to match the @@ -112,7 +112,7 @@ def perturb( examples in inputs (dim 0), and each tuple containing #output_dims - 1 elements. Each tuple is applied as the label for the corresponding example. - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. These arguments are provided to diff --git a/captum/robust/_core/metrics/attack_comparator.py b/captum/robust/_core/metrics/attack_comparator.py index 890cc01720..ccc686ca1c 100644 --- a/captum/robust/_core/metrics/attack_comparator.py +++ b/captum/robust/_core/metrics/attack_comparator.py @@ -64,11 +64,11 @@ def __init__( ) -> None: r""" Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (Callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of a model's forward function. - metric (callable): This function is applied to the model output in + metric (Callable): This function is applied to the model output in order to compute the desired performance metric or metrics. This function should have the following signature:: @@ -85,7 +85,7 @@ def __init__( If tensor metrics represent results for the full batch, the size of the first dimension should be 1. - preproc_fn (callable, optional): Optional method applied to inputs. Output + preproc_fn (Callable, optional): Optional method applied to inputs. Output of preproc_fn is then provided as input to model, in addition to additional_forward_args provided to evaluate. """ @@ -113,12 +113,12 @@ def add_attack( Adds attack to be evaluated when calling evaluate. 
Args: - attack (perturbation or callable): This can either be an instance + attack (Perturbation or Callable): This can either be an instance of a Captum Perturbation / Attack or any other perturbation or attack function such as a torchvision transform. - name (optional, str): Name or identifier for attack, used as key for + name (str, optional): Name or identifier for attack, used as key for attack results. This defaults to attack.__class__.__name__ if not provided and must be unique for all added attacks. @@ -239,7 +239,7 @@ def evaluate( Args: - inputs (any): Input for which attack metrics + inputs (Any): Input for which attack metrics are computed. It can be provided as a tensor, tuple of tensors, or any raw input type (e.g. PIL image or text string). This input is provided directly as input to preproc function as well @@ -247,7 +247,7 @@ def evaluate( function is provided, this input is provided directly to the main model and all attacks. - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the preprocessing outputs (or inputs if preproc_fn is None), this argument can be provided. It must be either a single additional @@ -273,7 +273,7 @@ def evaluate( (or inputs itself if no preproc_fn is provided) must be a tensor or tuple of tensors. Default: 1 - kwargs (any, optional): Additional keyword arguments provided to metric function + kwargs (Any, optional): Additional keyword arguments provided to metric function as well as selected attacks based on chosen additional_args Returns: diff --git a/captum/robust/_core/metrics/min_param_perturbation.py b/captum/robust/_core/metrics/min_param_perturbation.py index 0bb6dc1a43..92b54d8ff5 100644 --- a/captum/robust/_core/metrics/min_param_perturbation.py +++ b/captum/robust/_core/metrics/min_param_perturbation.py @@ -63,11 +63,11 @@ def __init__( corresponding perturbed input. 
Args: - forward_func (callable or torch.nn.Module): This can either be an instance + forward_func (Callable or torch.nn.Module): This can either be an instance of pytorch model or any modification of a model's forward function. - attack (Perturbation or callable): This can either be an instance + attack (Perturbation or Callable): This can either be an instance of a Captum Perturbation / Attack or any other perturbation or attack function such as a torchvision transform. @@ -94,7 +94,7 @@ def __init__( perturbation / attack functions Default: 1 - preproc_fn (callable, optional): Optional method applied to inputs. Output + preproc_fn (Callable, optional): Optional method applied to inputs. Output of preproc_fn is then provided as input to model, in addition to additional_forward_args provided to evaluate. Default: None @@ -103,7 +103,7 @@ def __init__( applied before or after preproc function. Default: False - correct_fn (callable, optional): This determines whether the perturbed input + correct_fn (Callable, optional): This determines whether the perturbed input leads to a correct or incorrect prediction. By default, this function is set to the standard classification test for correctness (comparing argmax of output with target), which requires model output to @@ -355,7 +355,7 @@ def evaluate( Args: - inputs (any): Input for which minimal perturbation + inputs (Any): Input for which minimal perturbation is computed. It can be provided as a tensor, tuple of tensors, or any raw input type (e.g. PIL image or text string). This input is provided directly as input to preproc function @@ -363,7 +363,7 @@ def evaluate( pre-processing function is provided, this input is provided directly to the main model and all attacks. 
- additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the preprocessing outputs (or inputs if preproc_fn is None), this argument can be provided. It must be either a single additional @@ -402,7 +402,7 @@ def evaluate( Tuple of (perturbed_inputs, param_val) if successful else Tuple of (None, None) - - **perturbed inputs** (any): + - **perturbed inputs** (Any): Perturbed input (output of attack) which results in incorrect prediction. - param_val (int, float) diff --git a/captum/robust/_core/pgd.py b/captum/robust/_core/pgd.py index 56dfa34038..6b4b01b749 100644 --- a/captum/robust/_core/pgd.py +++ b/captum/robust/_core/pgd.py @@ -44,9 +44,9 @@ def __init__( ) -> None: r""" Args: - forward_func (callable): The pytorch model for which the attack is + forward_func (Callable): The pytorch model for which the attack is computed. - loss_func (callable, optional): Loss function of which the gradient + loss_func (Callable, optional): Loss function of which the gradient computed. The loss function should take in outputs of the model and labels, and return the loss for each input tensor. The default loss function is negative log. @@ -55,7 +55,7 @@ def __init__( e.g. image pixels must be in the range 0-255 Attributes: - bound (callable): A function that bounds the input values based on + bound (Callable): A function that bounds the input values based on given lower_bound and upper_bound. Can be overwritten for custom use cases if necessary. """ @@ -92,7 +92,7 @@ def perturb( step_size (float): Step size of each gradient step. step_num (int): Step numbers. It usually guarantees that the perturbation can reach the border. - target (any): True labels of inputs if non-targeted attack is + target (Any): True labels of inputs if non-targeted attack is desired. Target class of inputs if targeted attack is desired. 
Target will be passed to the loss function to compute loss, so the type needs to match the @@ -118,7 +118,7 @@ def perturb( examples in inputs (dim 0), and each tuple containing #output_dims - 1 elements. Each tuple is applied as the label for the corresponding example. - additional_forward_args (any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument can be provided. These arguments are provided to From 12e0250e6fdc84bb4829f5cd6785694a2b1f086a Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 14:45:56 -0600 Subject: [PATCH 30/84] Fix more Sphinx warnings --- .../_core/layer/layer_integrated_gradients.py | 5 +++-- captum/attr/_core/layer/layer_lrp.py | 22 +++++++++---------- captum/attr/_core/lrp.py | 5 +++-- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 12a52cb16a..26182f9437 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -213,6 +213,7 @@ def attribute( - a tuple of tensors or scalars, the baseline corresponding to each tensor in the inputs' tuple can be: + - either a tensor with matching dimensions to corresponding tensor in the inputs' tuple or the first dimension is one and the remaining @@ -308,7 +309,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor*, tuple of *tensors* or tuple of *tensors*): + - **attributions** (*tensor*, tuple of *tensors* or tuple of *tensors*): Integrated gradients with respect to `layer`'s inputs or outputs. 
Attributions will always be the same size and dimensionality as the input or output of the given layer, @@ -327,7 +328,7 @@ def attribute( multiple tensors: the corresponding output element will be a tuple of tensors. The ordering of the outputs will be the same order as the layers given in the constructor. - - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*tensor*, returned if return_convergence_delta=True): The difference between the total approximated and true integrated gradients. This is computed using the property that the total sum of forward_func(inputs) - diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index 474f872c71..2f82c199b2 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -50,7 +50,6 @@ def __init__(self, model: Module, layer: ModuleOrModuleList) -> None: these are not supported by the register_full_backward_hook PyTorch API starting from PyTorch v1.9. - layer (torch.nn.Module or list of torch.nn.Module): Layer or layers for which attributions are computed. The size and dimensionality of the attributions @@ -110,8 +109,8 @@ def attribute( ], ]: r""" - Args: + inputs (tensor or tuple of tensors): Input for which relevance is propagated. If forward_func takes a single @@ -122,11 +121,11 @@ def attribute( to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. target (int, tuple, tensor or list, optional): Output indices for - which gradients are computed (for classification cases, - this is usually the target class). - If the network returns a scalar value per example, - no target index is necessary. - For general 2D outputs, targets can be either: + which gradients are computed (for classification cases, + this is usually the target class). + If the network returns a scalar value per example, + no target index is necessary. 
+ For general 2D outputs, targets can be either: - a single integer or a tensor containing a single integer, which is applied to all input examples @@ -177,8 +176,9 @@ def attribute( Returns: *tensor* or tuple of *tensors* of **attributions** or 2-element tuple of - **attributions**, **delta** or lists of **attributions** and **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + **attributions**, **delta** or lists of **attributions** and **delta**: + + - **attributions** (*tensor* or tuple of *tensors*): The propagated relevance values with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value @@ -190,8 +190,8 @@ def attribute( implementations. If attributions for all layers are returned (layer=None) a list of tensors or tuples of tensors is returned with entries for each layer. - - **delta** (*tensor* or list of *tensors* - returned if return_convergence_delta=True): + - **delta** (*tensor* or list of *tensors* + returned if return_convergence_delta=True): Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of of examples in input. diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index bcffd0304a..bd8d887a76 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -157,7 +157,8 @@ def attribute( Returns: *tensor* or tuple of *tensors* of **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + + - **attributions** (*tensor* or tuple of *tensors*): The propagated relevance values with respect to each input feature. The values are normalized by the output score value (sum(relevance)=1). To obtain values comparable to other @@ -171,7 +172,7 @@ def attribute( is one and not corresponding to the prediction score as in other implementations. 
- - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*tensor*, returned if return_convergence_delta=True): Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of of examples in the inputs. From 7838aafe081f02420520b1b760a9a5371e1a8492 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 14:52:49 -0600 Subject: [PATCH 31/84] Fix: E501 line too long --- captum/attr/_core/layer/layer_integrated_gradients.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 26182f9437..7a75623d7b 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -309,7 +309,8 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor*, tuple of *tensors* or tuple of *tensors*): + - **attributions** (*tensor*, tuple of *tensors* or tuple of + *tensors*): Integrated gradients with respect to `layer`'s inputs or outputs. 
Attributions will always be the same size and dimensionality as the input or output of the given layer, From 31f5d5aba59a1b911dae8905d669c2319c1bcf07 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 15:18:15 -0600 Subject: [PATCH 32/84] Replace accidental tabs with spaces --- captum/attr/_core/layer/layer_integrated_gradients.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 7a75623d7b..cc17745514 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -310,7 +310,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - **attributions** (*tensor*, tuple of *tensors* or tuple of - *tensors*): + *tensors*): Integrated gradients with respect to `layer`'s inputs or outputs. Attributions will always be the same size and dimensionality as the input or output of the given layer, From cedcffca247b5d037dc132570e4081c1c043a24d Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 15:35:06 -0600 Subject: [PATCH 33/84] Fix spacing issue --- captum/influence/_core/tracincp.py | 16 ++++++++-------- .../influence/_core/tracincp_fast_rand_proj.py | 6 +++--- captum/influence/_utils/common.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 37c802e9ec..92c1fb3282 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -197,7 +197,7 @@ def _self_influence(self, show_progress: bool = False): show_progress (bool, optional): To compute the self influence scores for all examples in training dataset `influence_src_dataset`, we compute the self influence scores for each batch. 
If - `show_progress`is true, the progress of this computation will be + `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which self influence scores have been computed will be displayed. It will try to use tqdm if available for advanced features (e.g. time @@ -232,7 +232,7 @@ def _get_k_most_influential( Default: True show_progress (bool, optional): To compute the proponents (or opponents) for the batch of examples, we perform computation for each batch in - training dataset `influence_src_dataset`, If `show_progress`is + training dataset `influence_src_dataset`, If `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which the computation has been performed will be displayed. It will try to use tqdm if @@ -284,7 +284,7 @@ def _influence( example to the i-th input example. show_progress (bool, optional): To compute the influence of examples in training dataset `influence_src_dataset`, we compute the influence - of each batch. If `show_progress`is true, the progress of this + of each batch. If `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which influence has been computed will be displayed. It will try to use tqdm if available for advanced features (e.g. time @@ -359,7 +359,7 @@ def influence( # type: ignore[override] show_progress (bool, optional): For all modes, computation of results requires "training dataset computations": computations for each batch in the training dataset `influence_src_dataset`, which may - take a long time. If `show_progress`is true, the progress of + take a long time. If `show_progress` is true, the progress of "training dataset computations" will be displayed. In particular, the number of batches for which computations have been performed will be displayed. 
It will try to use tqdm if available for @@ -681,7 +681,7 @@ def influence( # type: ignore[override] show_progress (bool, optional): For all modes, computation of results requires "training dataset computations": computations for each batch in the training dataset `influence_src_dataset`, which may - take a long time. If `show_progress`is true, the progress of + take a long time. If `show_progress` is true, the progress of "training dataset computations" will be displayed. In particular, the number of batches for which computations have been performed will be displayed. It will try to use tqdm if available for @@ -791,7 +791,7 @@ def _influence( Default: None show_progress (bool, optional): To compute the influence of examples in training dataset `influence_src_dataset`, we compute the influence - of each batch. If `show_progress`is true, the progress of this + of each batch. If `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which influence has been computed will be displayed. It will try to use tqdm if available for advanced features (e.g. time @@ -852,7 +852,7 @@ def _get_k_most_influential( Default: True show_progress (bool, optional): To compute the proponents (or opponents) for the batch of examples, we perform computation for each batch in - training dataset `influence_src_dataset`, If `show_progress`is + training dataset `influence_src_dataset`, If `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which the computation has been performed will be displayed. It will try to use tqdm if @@ -953,7 +953,7 @@ def _self_influence(self, show_progress: bool = False): show_progress (bool, optional): To compute the self influence scores for all examples in training dataset `influence_src_dataset`, we compute the self influence scores for each batch. 
If - `show_progress`is true, the progress of this computation will be + `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which self influence scores have been computed will be displayed. It will try to use tqdm if available for advanced features (e.g. time diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index f675ea15f1..c14955fea0 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -369,7 +369,7 @@ def _influence( # type: ignore[override] are required. show_progress (bool, optional): To compute the influence of examples in training dataset `influence_src_dataset`, we compute the influence - of each batch. If `show_progress`is true, the progress of this + of each batch. If `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which influence has been computed will be displayed. It will try to use tqdm if available for advanced features (e.g. time @@ -432,7 +432,7 @@ def _get_k_most_influential( # type: ignore[override] Default: True show_progress (bool, optional): To compute the proponents (or opponents) for the batch of examples, we perform computation for each batch in - training dataset `influence_src_dataset`, If `show_progress`is + training dataset `influence_src_dataset`, If `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which the computation has been performed will be displayed. It will try to use tqdm if @@ -519,7 +519,7 @@ def _self_influence(self, show_progress: bool = False): show_progress (bool, optional): To compute the self influence scores for all examples in training dataset `influence_src_dataset`, we compute the self influence scores for each batch. 
If - `show_progress`is true, the progress of this computation will be + `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which self influence scores have been computed will be displayed. It will try to use tqdm if available for advanced features (e.g. time diff --git a/captum/influence/_utils/common.py b/captum/influence/_utils/common.py index a186f8e4de..6f0931c721 100644 --- a/captum/influence/_utils/common.py +++ b/captum/influence/_utils/common.py @@ -222,7 +222,7 @@ def _get_k_most_influential_helper( Default: True show_progress (bool, optional): To compute the proponents (or opponents) for the batch of examples, we perform computation for each batch in - training dataset `influence_src_dataloader`, If `show_progress`is + training dataset `influence_src_dataloader`, If `show_progress` is true, the progress of this computation will be displayed. In particular, the number of batches for which the computation has been performed will be displayed. It will try to use tqdm if From ccdd660ad2e69d7223d66d68c77af7a8d82e7df2 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 15:43:32 -0600 Subject: [PATCH 34/84] Fix formatting --- captum/attr/_core/layer/internal_influence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index fac9ce1cfd..7a1fc32241 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -188,12 +188,12 @@ def attribute( Returns: *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*tensor* or tuple of *tensors*): Internal influence of each neuron in given layer output. 
Attributions will always be the same size as the output or input of the given layer depending on whether `attribute_to_layer_input` is set to `False` or - `True`respectively. + `True` respectively. Attributions are returned in a tuple if the layer inputs / outputs contain multiple tensors, otherwise a single tensor is returned. From 9675b916eb844be85e8a4c652f97cd46cd82aed1 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 15:48:17 -0600 Subject: [PATCH 35/84] Fix warning --- captum/attr/_core/layer/layer_integrated_gradients.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index cc17745514..01e1a5daaa 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -310,7 +310,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - **attributions** (*tensor*, tuple of *tensors* or tuple of - *tensors*): + *tensors*): Integrated gradients with respect to `layer`'s inputs or outputs. 
Attributions will always be the same size and dimensionality as the input or output of the given layer, From 6cb53697f655ac978d623b6e005b637f86a6271a Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 15:54:54 -0600 Subject: [PATCH 36/84] Fix warning --- captum/attr/_core/layer/layer_integrated_gradients.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 01e1a5daaa..4802da6067 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -309,10 +309,10 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: + - **attributions** (*tensor*, tuple of *tensors* or tuple of - *tensors*): - Integrated gradients with respect to `layer`'s inputs or - outputs. Attributions will always be the same size and + *tensors*): Integrated gradients with respect to `layer`'s inputs + or outputs. Attributions will always be the same size and dimensionality as the input or output of the given layer, depending on whether we attribute to the inputs or outputs of the layer which is decided by the input flag From 221a7d939fbd69b6c3b6d50d3c46967eeed1cd45 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 21 Jul 2022 16:08:36 -0600 Subject: [PATCH 37/84] Fix warning --- captum/attr/_core/layer/layer_integrated_gradients.py | 1 + 1 file changed, 1 insertion(+) diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 4802da6067..eb2d0762c2 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -329,6 +329,7 @@ def attribute( multiple tensors: the corresponding output element will be a tuple of tensors. 
The ordering of the outputs will be the same order as the layers given in the constructor. + - **delta** (*tensor*, returned if return_convergence_delta=True): The difference between the total approximated and true integrated gradients. This is computed using the property From 2c0331f1b5d01da185b91d853fe9f4b8e62776a2 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Fri, 22 Jul 2022 08:45:49 -0600 Subject: [PATCH 38/84] Improve docstring spacing & types --- captum/attr/_utils/visualization.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index 736244ba45..0f50b0cff8 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -184,7 +184,7 @@ def visualize_image_attr( `blended_heat_map` visualization mode, which overlays the heat map over the greyscaled original image. Default: 0.5 - show_colorbar (boolean, optional): Displays colorbar for heatmap below + show_colorbar (bool, optional): Displays colorbar for heatmap below the visualization. If given method does not use a heatmap, then a colormap axis is created and hidden. This is necessary for appropriate alignment when visualizing @@ -195,7 +195,7 @@ def visualize_image_attr( Default: None fig_size (tuple, optional): Size of figure created. Default: (6,6) - use_pyplot (boolean, optional): If true, uses pyplot to create and show + use_pyplot (bool, optional): If true, uses pyplot to create and show figure and displays the figure after creating. If False, uses Matplotlib object oriented API and simply returns a figure object without showing. @@ -371,7 +371,7 @@ def visualize_image_attr_multiple( Default: None fig_size (tuple, optional): Size of figure created. Default: (8, 6) - use_pyplot (boolean, optional): If true, uses pyplot to create and show + use_pyplot (bool, optional): If true, uses pyplot to create and show figure and displays the figure after creating. 
If False, uses Matplotlib object oriented API and simply returns a figure object without showing. @@ -474,7 +474,7 @@ def visualize_timeseries_attr( points on the x-axis. Shape must be in the form (N, ). If not provided, integers from 0 to N-1 are used. Default: None - method (string, optional): Chosen method for visualizing attributions + method (str, optional): Chosen method for visualizing attributions overlaid onto data. Supported options are: 1. `overlay_individual` - Plot each channel individually in @@ -489,8 +489,9 @@ def visualize_timeseries_attr( and color the graphs according to the attribution values. Works best with color maps that does not contain white or very bright colors. + Default: `overlay_individual` - sign (string, optional): Chosen sign of attributions to visualize. + sign (str, optional): Chosen sign of attributions to visualize. Supported options are: 1. `positive` - Displays only positive pixel attributions. @@ -502,8 +503,9 @@ def visualize_timeseries_attr( 4. `all` - Displays both positive and negative attribution values. + Default: `absolute_value` - channel_labels (list of strings, optional): List of labels + channel_labels (list of str, optional): List of labels corresponding to each channel in data. Default: None channels_last (bool, optional): If True, data is expected to have @@ -521,7 +523,7 @@ def visualize_timeseries_attr( and scale value are computed using absolute value of attributions. Default: 2 - cmap (string, optional): String corresponding to desired colormap for + cmap (str, optional): String corresponding to desired colormap for heatmap visualization. This defaults to "Reds" for negative sign, "Blues" for absolute value, "Greens" for positive sign, and a spectrum from red to green for all. Note that this @@ -531,14 +533,14 @@ def visualize_timeseries_attr( `blended_heat_map` visualization mode, which overlays the heat map over the greyscaled original image. 
Default: 0.7 - show_colorbar (boolean): Displays colorbar for heat map below + show_colorbar (bool): Displays colorbar for heat map below the visualization. - title (string, optional): Title string for plot. If None, no title is + title (str, optional): Title string for plot. If None, no title is set. Default: None fig_size (tuple, optional): Size of figure created. Default: (6,6) - use_pyplot (boolean): If true, uses pyplot to create and show + use_pyplot (bool): If true, uses pyplot to create and show figure and displays the figure after creating. If False, uses Matplotlib object oriented API and simply returns a figure object without showing. From f14c460dcca5bfa131fd82e0c633038b54334a1e Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Fri, 22 Jul 2022 11:49:41 -0600 Subject: [PATCH 39/84] Fix ReadMe --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 5f415f7e0a..5731b789d2 100644 --- a/README.md +++ b/README.md @@ -159,8 +159,7 @@ model.eval() Next, we need to define simple input and baseline tensors. Baselines belong to the input space and often carry no predictive signal. Zero tensor can serve as a baseline for many tasks. -Some interpretability algorithms such as `Integrated -Gradients`, `Deeplift` and `GradientShap` are designed to attribute the change +Some interpretability algorithms such as `IntegratedGradients`, `Deeplift` and `GradientShap` are designed to attribute the change between the input and baseline to a predictive class or a value that the neural network outputs. 
From 24bfcdff3c0a698c89f2c0baee4bcccd73a08dda Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Fri, 22 Jul 2022 13:11:39 -0600 Subject: [PATCH 40/84] Fix Robustness docs --- captum/robust/_core/fgsm.py | 26 +++++++---- .../robust/_core/metrics/attack_comparator.py | 43 +++++++++++-------- .../_core/metrics/min_param_perturbation.py | 22 +++++----- captum/robust/_core/pgd.py | 15 ++++--- 4 files changed, 63 insertions(+), 43 deletions(-) diff --git a/captum/robust/_core/fgsm.py b/captum/robust/_core/fgsm.py index 153b02ae47..f7f0b2670d 100644 --- a/captum/robust/_core/fgsm.py +++ b/captum/robust/_core/fgsm.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from typing import Any, Callable, Tuple +from typing import Any, Callable, Optional, Tuple import torch from captum._utils.common import ( @@ -22,22 +22,27 @@ class FGSM(Perturbation): r""" Fast Gradient Sign Method is an one-step method that can generate - adversarial examples. For non-targeted attack, the formulation is - x' = x + epsilon * sign(gradient of L(theta, x, y)). - For targeted attack on t, the formulation is - x' = x - epsilon * sign(gradient of L(theta, x, t)). - L(theta, x, y) is the model's loss function with respect to model + adversarial examples. + + For non-targeted attack, the formulation is:: + + x' = x + epsilon * sign(gradient of L(theta, x, y)) + + For targeted attack on t, the formulation is:: + + x' = x - epsilon * sign(gradient of L(theta, x, t)) + + ``L(theta, x, y)`` is the model's loss function with respect to model parameters, inputs and labels. More details on Fast Gradient Sign Method can be found in the original - paper: - https://arxiv.org/abs/1412.6572 + paper: https://arxiv.org/abs/1412.6572 """ def __init__( self, forward_func: Callable, - loss_func: Callable = None, + loss_func: Optional[Callable] = None, lower_bound: float = float("-inf"), upper_bound: float = float("inf"), ) -> None: @@ -50,8 +55,10 @@ def __init__( model and labels, and return a loss tensor. 
The default loss function is negative log. lower_bound (float, optional): Lower bound of input values. + Default: ``float("-inf")`` upper_bound (float, optional): Upper bound of input values. e.g. image pixels must be in the range 0-255 + Default: ``float("inf")`` Attributes: bound (Callable): A function that bounds the input values based on @@ -112,6 +119,7 @@ def perturb( examples in inputs (dim 0), and each tuple containing #output_dims - 1 elements. Each tuple is applied as the label for the corresponding example. + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the inputs for which attributions should not be computed, this argument diff --git a/captum/robust/_core/metrics/attack_comparator.py b/captum/robust/_core/metrics/attack_comparator.py index ccc686ca1c..9ea7b2cd7a 100644 --- a/captum/robust/_core/metrics/attack_comparator.py +++ b/captum/robust/_core/metrics/attack_comparator.py @@ -60,7 +60,7 @@ def __init__( self, forward_func: Callable, metric: Callable[..., MetricResultType], - preproc_fn: Callable = None, + preproc_fn: Optional[Callable] = None, ) -> None: r""" Args: @@ -88,6 +88,7 @@ def __init__( preproc_fn (Callable, optional): Optional method applied to inputs. Output of preproc_fn is then provided as input to model, in addition to additional_forward_args provided to evaluate. + Default: ``None`` """ self.forward_func = forward_func self.metric: Callable = metric @@ -113,6 +114,7 @@ def add_attack( Adds attack to be evaluated when calling evaluate. Args: + attack (Perturbation or Callable): This can either be an instance of a Captum Perturbation / Attack or any other perturbation or attack function such @@ -121,23 +123,29 @@ def add_attack( name (str, optional): Name or identifier for attack, used as key for attack results. This defaults to attack.__class__.__name__ if not provided and must be unique for all added attacks. 
+ Default: ``None`` - num_attempts (int): Number of attempts that attack should be + num_attempts (int, optional): Number of attempts that attack should be repeated. This should only be set to > 1 for non-deterministic attacks. The minimum, maximum, and average (best, worst, and average case) are tracked for attack attempts. - - apply_before_preproc (bool): Defines whether attack should be applied - before or after preproc function. - - attack_kwargs (dict): Additional arguments to be provided to given attack. - This should be provided as a dictionary of keyword arguments. - - additional_attack_arg_names (list of str): Any additional arguments for the - attack which are specific to the particular input example or batch. - An example of this is target, which is necessary for some attacks such - as FGSM or PGD. These arguments are included if provided as a kwarg - to evaluate. + Default: ``1`` + + apply_before_preproc (bool, optional): Defines whether attack should be + applied before or after preproc function. + Default: ``True`` + + attack_kwargs (dict, optional): Additional arguments to be provided to + given attack. This should be provided as a dictionary of keyword + arguments. + Default: ``None`` + + additional_attack_arg_names (list of str, optional): Any additional + arguments for the attack which are specific to the particular input + example or batch. An example of this is target, which is necessary + for some attacks such as FGSM or PGD. These arguments are included + if provided as a kwarg to evaluate. + Default: ``None`` """ if name is None: name = attack.__class__.__name__ @@ -259,7 +267,7 @@ def evaluate( For a tensor, the first dimension of the tensor must correspond to the number of examples. For all other types, the given argument is used for all forward evaluations. 
- Default: None + Default: ``None`` perturbations_per_eval (int, optional): Allows perturbations of multiple attacks to be grouped and evaluated in one call of forward_fn Each forward pass will contain a maximum of @@ -272,9 +280,10 @@ def evaluate( In order to apply this functionality, the output of preproc_fn (or inputs itself if no preproc_fn is provided) must be a tensor or tuple of tensors. - Default: 1 + Default: ``1`` kwargs (Any, optional): Additional keyword arguments provided to metric function - as well as selected attacks based on chosen additional_args + as well as selected attacks based on chosen additional_args. + Default: ``None`` Returns: diff --git a/captum/robust/_core/metrics/min_param_perturbation.py b/captum/robust/_core/metrics/min_param_perturbation.py index 92b54d8ff5..95b2897a08 100644 --- a/captum/robust/_core/metrics/min_param_perturbation.py +++ b/captum/robust/_core/metrics/min_param_perturbation.py @@ -85,23 +85,23 @@ def __init__( arg_step (int, float): Minimum interval for increase of target variable. mode (str, optional): Mode for search of minimum attack value; - either 'linear' for linear search on variable, or 'binary' for + either ``linear`` for linear search on variable, or ``binary`` for binary search of variable - Default: 'linear' + Default: ``linear`` num_attempts (int, optional): Number of attempts or trials with given variable. This should only be set to > 1 for non-deterministic perturbation / attack functions - Default: 1 + Default: ``1`` preproc_fn (Callable, optional): Optional method applied to inputs. Output of preproc_fn is then provided as input to model, in addition to additional_forward_args provided to evaluate. - Default: None + Default: ``None`` apply_before_preproc (bool, optional): Defines whether attack should be applied before or after preproc function. 
- Default: False + Default: ``False`` correct_fn (Callable, optional): This determines whether the perturbed input leads to a correct or incorrect prediction. By default, this function @@ -114,13 +114,15 @@ def __init__( function must be provided which determines correctness. The first argument to this function must be the model out; - any additional arguments should be provided through correct_fn_kwargs. + any additional arguments should be provided through + ``correct_fn_kwargs``. This function should have the following signature: + def correct_fn(model_out: Tensor, **kwargs: Any) -> bool Method should return a boolean if correct (True) and incorrect (False). - Default: None (applies standard correct_fn for classification) + Default: ``None`` (applies standard correct_fn for classification) """ self.forward_func = forward_func self.attack = attack @@ -375,9 +377,9 @@ def evaluate( For a tensor, the first dimension of the tensor must correspond to the number of examples. For all other types, the given argument is used for all forward evaluations. - Default: None + Default: ``None`` target (TargetType): Target class for classification. This is required if - using the default correct_fn + using the default ``correct_fn``. perturbations_per_eval (int, optional): Allows perturbations of multiple attacks to be grouped and evaluated in one call of forward_fn @@ -391,7 +393,7 @@ def evaluate( In order to apply this functionality, the output of preproc_fn (or inputs itself if no preproc_fn is provided) must be a tensor or tuple of tensors. 
- Default: 1 + Default: ``1`` attack_kwargs (dict, optional): Optional dictionary of keyword arguments provided to attack function correct_fn_kwargs (dict, optional): Optional dictionary of keyword diff --git a/captum/robust/_core/pgd.py b/captum/robust/_core/pgd.py index 6b4b01b749..056fe9fe1b 100644 --- a/captum/robust/_core/pgd.py +++ b/captum/robust/_core/pgd.py @@ -31,8 +31,7 @@ class PGD(Perturbation): x_(t+1) = Clip_r(x_t - alpha * sign(gradient of L(theta, x, t))) More details on Projected Gradient Descent can be found in the original - paper: - https://arxiv.org/abs/1706.06083 + paper: https://arxiv.org/abs/1706.06083 """ def __init__( @@ -51,8 +50,10 @@ def __init__( model and labels, and return the loss for each input tensor. The default loss function is negative log. lower_bound (float, optional): Lower bound of input values. + Default: ``float("-inf")`` upper_bound (float, optional): Upper bound of input values. e.g. image pixels must be in the range 0-255 + Default: ``float("inf")`` Attributes: bound (Callable): A function that bounds the input values based on @@ -123,14 +124,14 @@ def perturb( which attributions should not be computed, this argument can be provided. These arguments are provided to forward_func in order following the arguments in inputs. - Default: None. + Default: ``None`` targeted (bool, optional): If attack should be targeted. - Default: False. + Default: ``False`` random_start (bool, optional): If a random initialization is added to - inputs. Default: False. + inputs. Default: ``False`` norm (str, optional): Specifies the norm to calculate distance from - original inputs: 'Linf'|'L2'. - Default: 'Linf'. + original inputs: ``Linf`` | ``L2``. 
+ Default: ``Linf`` Returns: From 7f0457bacb08e85488f028b8a977287a8afd4a18 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sat, 23 Jul 2022 09:20:17 -0600 Subject: [PATCH 41/84] Add type improvements to `conf.py` --- sphinx/source/conf.py | 63 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index 27bdc763fd..ce4667fa51 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -10,7 +10,9 @@ # -- Path setup -------------------------------------------------------------- import os +import re import sys +from typing import List base_path = os.path.abspath(os.path.join(__file__, "..", "..", "..")) # read module from src instead of installation @@ -201,3 +203,64 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True + + +# -- Docstring Improvements -------------------------------------------------- + + +def autodoc_process_docstring( + app, what: str, name: str, obj, options, lines: List[str] +) -> None: + """ + Modify docstrings before creating html files. + + Sphinx converts the 'Args:' and 'Returns:' sections of docstrings into + reStructuredText (rST) syntax, which can then be found via ':type' & ':rtype'. 
+ + See here for more information: + https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html + """ + for i in range(len(lines)): + # Skip unless line is an parameter doc or a return doc + if not (lines[i].startswith(":type") or lines[i].startswith(":rtype")): + continue + + # Change "nn.Module" to "torch.nn.Module" in doc type hints for intersphinx + lines[i] = re.sub(r"\bnn.Module\b", "torch.nn.Module", lines[i]) + lines[i] = lines[i].replace("torch.torch.", "torch.") + + # Ensure nn.Module and torch.Tensor are hyperlinked + lines[i] = re.sub(r"\btorch.nn.Module\b", ":obj:`torch.nn.Module`", lines[i]) + lines[i] = re.sub(r"\btorch.Tensor\b", ":obj:`torch.Tensor`", lines[i]) + + # Handle Any & Callable types + lines[i] = re.sub(r"\bAny\b", ":obj:`Any `", lines[i]) + lines[i] = re.sub( + r"\bCallable\b", ":obj:`Callable `", lines[i] + ) + + # Handle list & tuple types + lines[i] = re.sub(r"\blist\b", ":obj:`list`", lines[i]) + lines[i] = re.sub(r"\btuple\b", ":obj:`tuple`", lines[i]) + + # Handle str & slice types + lines[i] = re.sub(r"\bstr\b", ":obj:`str`", lines[i]) + lines[i] = re.sub(r"\bslice\b", ":obj:`slice`", lines[i]) + + # Handle int & float types + lines[i] = re.sub(r"\bint\b", ":obj:`int`", lines[i]) + lines[i] = re.sub(r"\bfloat\b", ":obj:`float`", lines[i]) + + # Handle tensor types that are using lowercase + # Bolding return types doesn't work with Sphinx hyperlinks + lines[i] = lines[i].replace("*tensors*", "tensors") + lines[i] = lines[i].replace("*tensor*", "tensor") + lines[i] = re.sub(r"\btensor\b", ":class:`tensor `", lines[i]) + lines[i] = re.sub(r"\btensors\b", ":class:`tensors `", lines[i]) + + # Handle None type + lines[i] = re.sub(r"\bNone\b", ":obj:`None`", lines[i]) + + +def setup(app) -> None: + app.connect("autodoc-process-docstring", autodoc_process_docstring) From 7b9156e890d8a167abad8751ecd27ded2b42e403 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 24 Jul 2022 18:01:39 -0600 Subject: [PATCH 42/84] Set 
autodoc_preserve_defaults to True --- sphinx/source/conf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index ce4667fa51..737c3d4c31 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -77,6 +77,11 @@ # Inlcude init docstrings into body of autoclass directives autoclass_content = "both" +# Preserve signature defaults +# Prevents entire tensors from being printed, & gives callable functions +# proper names +autodoc_preserve_defaults = True + # Configuration for intersphinx: refer to the Python standard library and PyTorch intersphinx_mapping = { "python": ("https://docs.python.org/3", None), From 512e2d49851803aea5eb78fc7f872809df716bcf Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 25 Jul 2022 14:47:11 -0600 Subject: [PATCH 43/84] Escape '.' in regex str replacement --- sphinx/source/conf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index 737c3d4c31..b64633468c 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -231,12 +231,12 @@ def autodoc_process_docstring( continue # Change "nn.Module" to "torch.nn.Module" in doc type hints for intersphinx - lines[i] = re.sub(r"\bnn.Module\b", "torch.nn.Module", lines[i]) + lines[i] = re.sub(r"\bnn\.Module\b", "torch.nn.Module", lines[i]) lines[i] = lines[i].replace("torch.torch.", "torch.") # Ensure nn.Module and torch.Tensor are hyperlinked - lines[i] = re.sub(r"\btorch.nn.Module\b", ":obj:`torch.nn.Module`", lines[i]) - lines[i] = re.sub(r"\btorch.Tensor\b", ":obj:`torch.Tensor`", lines[i]) + lines[i] = re.sub(r"\btorch\.nn\.Module\b", ":obj:`torch.nn.Module`", lines[i]) + lines[i] = re.sub(r"\btorch\.Tensor\b", ":obj:`torch.Tensor`", lines[i]) # Handle Any & Callable types lines[i] = re.sub(r"\bAny\b", ":obj:`Any `", lines[i]) From 46acbb1e3926b674ac20bb9b4f0c21b9baa496a4 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 25 Jul 2022 15:41:05 -0600 
Subject: [PATCH 44/84] Fix docstring type --- captum/attr/_models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/captum/attr/_models/base.py b/captum/attr/_models/base.py index 25ed8469fb..3a62281d08 100644 --- a/captum/attr/_models/base.py +++ b/captum/attr/_models/base.py @@ -57,7 +57,7 @@ def forward(self, *inputs, **kwargs): Returns: - embedding_tensor (Tensor): + embedding_tensor (tensor): Returns a tensor which is the same as first argument passed to the forward function. It passes pre-computed embedding tensors to lower layers From 646302f57430a7b8f6a909a8e9394e6a6ce65dc5 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 25 Jul 2022 17:58:11 -0600 Subject: [PATCH 45/84] Add Sphinx refs to docstrings --- captum/attr/_core/feature_permutation.py | 16 +++++++++------- captum/attr/_core/gradient_shap.py | 2 +- captum/attr/_core/layer/layer_gradient_shap.py | 2 +- captum/attr/_core/neuron/neuron_gradient_shap.py | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index 78f1e3cf80..2b705cc476 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -101,10 +101,12 @@ def attribute( # type: ignore **kwargs: Any, ) -> TensorOrTupleOfTensorsGeneric: r""" - This function is almost equivalent to `FeatureAblation.attribute`. The - main difference is the way ablated examples are generated. Specifically - they are generated through the `perm_func`, as we set the baselines for - `FeatureAblation.attribute` to None. + This function is almost equivalent to + :func:`FeatureAblation.attribute `. The + main difference is the way ablated examples are generated. Specifically they + are generated through the ``perm_func``, as we set the baselines for + :func:`FeatureAblation.attribute ` to + ``None``. Args: @@ -196,9 +198,9 @@ def attribute( # type: ignore a simple output of progress. 
Default: False **kwargs (Any, optional): Any additional arguments used by child - classes of FeatureAblation (such as Occlusion) to construct - ablations. These arguments are ignored when using - FeatureAblation directly. + classes of :class:`.FeatureAblation` (such as + :class:`.Occlusion`) to construct ablations. These + arguments are ignored when using FeatureAblation directly. Default: None Returns: diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 47bb7b3955..f074e4a938 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -50,7 +50,7 @@ class GradientShap(GradientAttribution): In some sense it can be viewed as an approximation of integrated gradients by computing the expectations of gradients for different baselines. - Current implementation uses Smoothgrad from `NoiseTunnel` in order to + Current implementation uses Smoothgrad from :class:`.NoiseTunnel` in order to randomly draw samples from the distribution of baselines, add noise to input samples and compute the expectation (smoothgrad). """ diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 54bbf6613c..851bebc5ab 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -52,7 +52,7 @@ class LayerGradientShap(LayerAttribution, GradientAttribution): In some sense it can be viewed as an approximation of integrated gradients by computing the expectations of gradients for different baselines. - Current implementation uses Smoothgrad from `NoiseTunnel` in order to + Current implementation uses Smoothgrad from :class:`.NoiseTunnel` in order to randomly draw samples from the distribution of baselines, add noise to input samples and compute the expectation (smoothgrad). 
""" diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index ccf5524472..503b471e94 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -41,7 +41,7 @@ class NeuronGradientShap(NeuronAttribution, GradientAttribution): In some sense it can be viewed as an approximation of integrated gradients by computing the expectations of gradients for different baselines. - Current implementation uses Smoothgrad from `NoiseTunnel` in order to + Current implementation uses Smoothgrad from :class:`.NoiseTunnel` in order to randomly draw samples from the distribution of baselines, add noise to input samples and compute the expectation (smoothgrad). """ From 55ec6b5c21454e0cb4f92923eb7cf28c2d11ac97 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 25 Jul 2022 19:11:41 -0600 Subject: [PATCH 46/84] Fix NoiseTunnel docstring research paper list --- captum/attr/_core/noise_tunnel.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index b4dc561256..608ea4e4d1 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -43,10 +43,11 @@ class NoiseTunnel(Attribution): returned. 
More details about adding noise can be found in the following papers: - https://arxiv.org/abs/1810.03292 - https://arxiv.org/abs/1810.03307 - https://arxiv.org/abs/1706.03825 - https://arxiv.org/abs/1806.10758 + + * https://arxiv.org/abs/1810.03292 + * https://arxiv.org/abs/1810.03307 + * https://arxiv.org/abs/1706.03825 + * https://arxiv.org/abs/1806.10758 This method currently also supports batches of multiple examples input, however it can be computationally expensive depending on the model, From 262e9eacaf5e8aa9129443711e7a2954f14f9442 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 26 Jul 2022 15:09:34 -0600 Subject: [PATCH 47/84] Spelling fixes --- captum/attr/_core/deep_lift.py | 2 +- captum/attr/_core/feature_ablation.py | 2 +- captum/attr/_core/integrated_gradients.py | 2 +- captum/attr/_core/layer/layer_conductance.py | 2 +- captum/attr/_core/layer/layer_deep_lift.py | 2 +- captum/attr/_core/layer/layer_integrated_gradients.py | 4 ++-- captum/attr/_core/layer/layer_lrp.py | 2 +- captum/attr/_core/lime.py | 4 ++-- captum/attr/_core/neuron/neuron_deep_lift.py | 4 ++-- captum/attr/_core/neuron/neuron_feature_ablation.py | 2 +- captum/attr/_core/neuron/neuron_gradient.py | 2 +- captum/attr/_core/neuron/neuron_gradient_shap.py | 4 ++-- .../attr/_core/neuron/neuron_guided_backprop_deconvnet.py | 4 ++-- captum/attr/_core/neuron/neuron_integrated_gradients.py | 2 +- captum/attr/_core/noise_tunnel.py | 2 +- captum/attr/_utils/attribution.py | 8 ++++---- captum/concept/_utils/classifier.py | 2 +- captum/metrics/_core/sensitivity.py | 8 ++++---- captum/robust/_core/fgsm.py | 6 +++--- captum/robust/_core/perturbation.py | 2 +- captum/robust/_core/pgd.py | 2 +- 21 files changed, 34 insertions(+), 34 deletions(-) diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 47b220358e..7ac7b7a7a9 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -303,7 +303,7 @@ def attribute( # type: ignore based on 
DeepLift's rescale rule. Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of - of examples in input. + examples in input. Note that the logic described for deltas is guaranteed when the default logic for attribution computations is used, meaning that the `custom_attribution_func=None`, otherwise it is not guaranteed and diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index 11dd7bf89b..f28745268a 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -415,7 +415,7 @@ def _ith_input_ablation_generator( **kwargs, ): """ - This method return an generator of ablation perturbations of the i-th input + This method returns a generator of ablation perturbations of the i-th input Returns: ablation_iter (generator): yields each perturbation to be evaluated diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index e800bb1b3c..d7f16edeb6 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -249,7 +249,7 @@ def attribute( # type: ignore integrated gradient. Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of - of examples in inputs. + examples in inputs. Examples:: diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index bbeeed5795..6a09dd831a 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -253,7 +253,7 @@ def attribute( the total sum of the attributions. Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of - of examples in inputs. + examples in inputs. 
Examples:: diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 42a9e359c0..6833e96252 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -272,7 +272,7 @@ def attribute( rescale rule. Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of - of examples in input. + examples in input. Note that the logic described for deltas is guaranteed when the default logic for attribution computations is used, meaning that the `custom_attribution_func=None`, otherwise diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index eb2d0762c2..795e73a06e 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -100,7 +100,7 @@ def __init__( if isinstance(layer, list) and len(layer) > 1: warnings.warn( "Multiple layers provided. Please ensure that each layer is" - "**not** solely solely dependent on the outputs of" + "**not** solely dependent on the outputs of" "another layer. Please refer to the documentation for more" "detail." ) @@ -338,7 +338,7 @@ def attribute( integrated gradient. Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of - of examples in inputs. + examples in inputs. Examples:: diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index 2f82c199b2..f1ac39632f 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -194,7 +194,7 @@ def attribute( returned if return_convergence_delta=True): Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of - of examples in input. + examples in input. 
If attributions for all layers are returned (layer=None) a list of tensors is returned with entries for each layer. diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index 77150a3f94..8391431712 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -569,7 +569,7 @@ def default_from_interp_rep_transform(curr_sample, original_inputs, **kwargs): ), "Must provide feature_mask to use default interpretable representation transform" assert ( "baselines" in kwargs - ), "Must provide baselines to use default interpretable representation transfrom" + ), "Must provide baselines to use default interpretable representation transform" feature_mask = kwargs["feature_mask"] if isinstance(feature_mask, Tensor): binary_mask = curr_sample[0][feature_mask].bool() @@ -767,7 +767,7 @@ def __init__( This is often referred to as a similarity kernel. This argument is optional and defaults to a function which - applies an exponential kernel to the consine distance between + applies an exponential kernel to the cosine distance between the original input and perturbed input, with a kernel width of 1.0. diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 43a1b96a27..b743e48caa 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -120,7 +120,7 @@ def attribute( indexed output tensor is used for attribution. Note that specifying a slice of a tensor would amount to computing the attribution of the sum of the specified - neurons, and not the individual neurons independantly. + neurons, and not the individual neurons independently. - a callable, which should take the target layer as input (single tensor or tuple @@ -365,7 +365,7 @@ def attribute( indexed output tensor is used for attribution. 
Note that specifying a slice of a tensor would amount to computing the attribution of the sum of the specified - neurons, and not the individual neurons independantly. + neurons, and not the individual neurons independently. - a callable, which should take the target layer as input (single tensor or tuple diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index 61edd9f418..8790697cb2 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -96,7 +96,7 @@ def attribute( indexed output tensor is used for attribution. Note that specifying a slice of a tensor would amount to computing the attribution of the sum of the specified - neurons, and not the individual neurons independantly. + neurons, and not the individual neurons independently. - a callable, which should take the target layer as input (single tensor or tuple diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 480bc4f6e1..9c817a8a7b 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -93,7 +93,7 @@ def attribute( indexed output tensor is used for attribution. Note that specifying a slice of a tensor would amount to computing the attribution of the sum of the specified - neurons, and not the individual neurons independantly. + neurons, and not the individual neurons independently. - a callable, which should take the target layer as input (single tensor or tuple diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index 503b471e94..cd2093bcd7 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -60,7 +60,7 @@ def __init__( modification of it layer (torch.nn.Module): Layer for which neuron attributions are computed. 
The output size of the attribute method matches the - dimensions of the inputs or ouputs of the neuron with + dimensions of the inputs or outputs of the neuron with index `neuron_selector` in this layer, depending on whether we attribute to the inputs or outputs of the neuron. Currently, it is assumed that the inputs or the outputs @@ -135,7 +135,7 @@ def attribute( indexed output tensor is used for attribution. Note that specifying a slice of a tensor would amount to computing the attribution of the sum of the specified - neurons, and not the individual neurons independantly. + neurons, and not the individual neurons independently. - a callable, which should take the target layer as input (single tensor or tuple diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 1a96ce7497..98d55f4b0b 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -99,7 +99,7 @@ def attribute( indexed output tensor is used for attribution. Note that specifying a slice of a tensor would amount to computing the attribution of the sum of the specified - neurons, and not the individual neurons independantly. + neurons, and not the individual neurons independently. - a callable, which should take the target layer as input (single tensor or tuple @@ -268,7 +268,7 @@ def attribute( indexed output tensor is used for attribution. Note that specifying a slice of a tensor would amount to computing the attribution of the sum of the specified - neurons, and not the individual neurons independantly. + neurons, and not the individual neurons independently. 
- a callable, which should take the target layer as input (single tensor or tuple diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index 0c29f1943b..4ea4d333d6 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -113,7 +113,7 @@ def attribute( indexed output tensor is used for attribution. Note that specifying a slice of a tensor would amount to computing the attribution of the sum of the specified - neurons, and not the individual neurons independantly. + neurons, and not the individual neurons independently. - a callable, which should take the target layer as input (single tensor or tuple diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 608ea4e4d1..9cd903b8bf 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -168,7 +168,7 @@ def attribute( >>> nt = NoiseTunnel(ig) >>> # Generates 10 perturbed input tensors per image. >>> # Computes integrated gradients for class 3 for each generated - >>> # input and averages attributions accros all 10 + >>> # input and averages attributions across all 10 >>> # perturbed inputs per image >>> attribution = nt.attribute(input, nt_type='smoothgrad', >>> nt_samples=10, target=3) diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index 2c66481870..60bb8da663 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -50,7 +50,7 @@ def __init__(self, forward_func: Callable) -> None: inputs (tensor or tuple of tensors): Input for which attribution is computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors - are provided, the batch sizes must be aligned accross all + are provided, the batch sizes must be aligned across all tensors. 
@@ -186,7 +186,7 @@ def compute_convergence_delta( attributions (tensor or tuple of tensors): Precomputed attribution scores. The user can compute those using any attribution - algorithm. It is assumed the the shape and the + algorithm. It is assumed the shape and the dimensionality of attributions must match the shape and the dimensionality of `start_point` and `end_point`. It also assumes that the attribution tensor's @@ -351,7 +351,7 @@ def __init__( class LayerAttribution(InternalAttribution): r""" - Layer attribution provides attribution values for the given layer, quanitfying + Layer attribution provides attribution values for the given layer, quantifying the importance of each neuron within the given layer's output. The output attribution of calling attribute on a LayerAttribution object always matches the size of the layer output. @@ -418,7 +418,7 @@ def interpolate( class NeuronAttribution(InternalAttribution): r""" - Neuron attribution provides input attribution for a given neuron, quanitfying + Neuron attribution provides input attribution for a given neuron, quantifying the importance of each input feature in the activation of a particular neuron. Calling attribute on a NeuronAttribution object requires also providing the index of the neuron in the output of the given layer for which attributions diff --git a/captum/concept/_utils/classifier.py b/captum/concept/_utils/classifier.py index 5bdf605470..73092edda8 100644 --- a/captum/concept/_utils/classifier.py +++ b/captum/concept/_utils/classifier.py @@ -189,7 +189,7 @@ def weights(self) -> Tensor: r""" This function returns a C x F tensor weights, where C is the number of classes and F is the number of features. - In case of binary classification, C = 2 othewise it is > 2. + In case of binary classification, C = 2 otherwise it is > 2. 
Returns: weights (tensor): A torch Tensor with the weights resulting from diff --git a/captum/metrics/_core/sensitivity.py b/captum/metrics/_core/sensitivity.py index 1f8d987a67..1b58893039 100644 --- a/captum/metrics/_core/sensitivity.py +++ b/captum/metrics/_core/sensitivity.py @@ -31,7 +31,7 @@ def default_perturb_func( Args: inputs (tensor or a tuple of tensors): The input tensors that we'd - like to perturb by adding a random noise sampled unifromly + like to perturb by adding a random noise sampled uniformly random from an L_infinity ball with a radius `perturb_radius`. radius (float): A radius used for sampling from @@ -39,8 +39,8 @@ def default_perturb_func( Returns: - perturbed_input (tuple(tensor)): A list of perturbed inputs that - are createed by adding noise sampled uniformly random + perturbed_input (tuple of tensor): A list of perturbed inputs that + are created by adding noise sampled uniformly random from L_infiniy ball with a radius `perturb_radius` to the original inputs. @@ -138,7 +138,7 @@ def sensitivity_max( perturb_radius (float, optional): The epsilon radius used for sampling. In the `default_perturb_func` it is used as the radius of the L-Infinity ball. In a general case it can serve as a radius of - any L_p nom. + any L_p norm. This argument is passed to `perturb_func` if it takes more than one argument. diff --git a/captum/robust/_core/fgsm.py b/captum/robust/_core/fgsm.py index f7f0b2670d..be6f3a474c 100644 --- a/captum/robust/_core/fgsm.py +++ b/captum/robust/_core/fgsm.py @@ -21,7 +21,7 @@ class FGSM(Perturbation): r""" - Fast Gradient Sign Method is an one-step method that can generate + Fast Gradient Sign Method is a one-step method that can generate adversarial examples. For non-targeted attack, the formulation is:: @@ -91,7 +91,7 @@ def perturb( attack is computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors are provided, the batch sizes must be - aligned accross all tensors. 
+ aligned across all tensors. epsilon (float): Step size of perturbation. target (Any): True labels of inputs if non-targeted attack is desired. Target class of inputs if targeted attack @@ -175,7 +175,7 @@ def _perturb( r""" A helper function to calculate the perturbed inputs given original inputs, gradient of loss function and epsilon. The calculation is - different for targetd v.s. non-targeted as described above. + different for targeted v.s. non-targeted as described above. """ multiplier = -1 if targeted else 1 inputs = tuple( diff --git a/captum/robust/_core/perturbation.py b/captum/robust/_core/perturbation.py index 9eb6d53481..76129d4749 100644 --- a/captum/robust/_core/perturbation.py +++ b/captum/robust/_core/perturbation.py @@ -21,7 +21,7 @@ class Perturbation: inputs (tensor or tuple of tensors): Input for which adversarial attack is computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors - are provided, the batch sizes must be aligned accross all + are provided, the batch sizes must be aligned across all tensors. Returns: diff --git a/captum/robust/_core/pgd.py b/captum/robust/_core/pgd.py index 056fe9fe1b..8c22302497 100644 --- a/captum/robust/_core/pgd.py +++ b/captum/robust/_core/pgd.py @@ -87,7 +87,7 @@ def perturb( attack is computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors are provided, the batch sizes must be - aligned accross all tensors. + aligned across all tensors. radius (float): Radius of the neighbor ball centered around inputs. The perturbation should be within this range. step_size (float): Step size of each gradient step. 
From b2757621145a55acd23cfc8b40916b01dca2c0a2 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 27 Jul 2022 11:25:27 -0600 Subject: [PATCH 48/84] Add missing function to Sphinx API docs --- sphinx/source/conf.py | 3 ++- sphinx/source/utilities.rst | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index b64633468c..cd81379011 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -248,8 +248,9 @@ def autodoc_process_docstring( lines[i] = re.sub(r"\blist\b", ":obj:`list`", lines[i]) lines[i] = re.sub(r"\btuple\b", ":obj:`tuple`", lines[i]) - # Handle str & slice types + # Handle str, bool, & slice types lines[i] = re.sub(r"\bstr\b", ":obj:`str`", lines[i]) + lines[i] = re.sub(r"\bbool\b", ":obj:`bool`", lines[i]) lines[i] = re.sub(r"\bslice\b", ":obj:`slice`", lines[i]) # Handle int & float types diff --git a/sphinx/source/utilities.rst b/sphinx/source/utilities.rst index f4e3d7ace6..a19e75df9e 100644 --- a/sphinx/source/utilities.rst +++ b/sphinx/source/utilities.rst @@ -8,6 +8,8 @@ Visualization .. autofunction:: captum.attr.visualization.visualize_image_attr_multiple +.. autofunction:: captum.attr.visualization.visualize_timeseries_attr + Interpretable Embeddings ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16,6 +18,7 @@ Interpretable Embeddings :members: .. autofunction:: captum.attr.configure_interpretable_embedding_layer + .. 
autofunction:: captum.attr.remove_interpretable_embedding_layer From e0b3281ca7692bc660a94d0d73be17c47d660843 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 27 Jul 2022 20:15:47 -0600 Subject: [PATCH 49/84] Update conf.py --- sphinx/source/conf.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index cd81379011..62c62ff391 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -230,12 +230,7 @@ def autodoc_process_docstring( if not (lines[i].startswith(":type") or lines[i].startswith(":rtype")): continue - # Change "nn.Module" to "torch.nn.Module" in doc type hints for intersphinx - lines[i] = re.sub(r"\bnn\.Module\b", "torch.nn.Module", lines[i]) - lines[i] = lines[i].replace("torch.torch.", "torch.") - - # Ensure nn.Module and torch.Tensor are hyperlinked - lines[i] = re.sub(r"\btorch\.nn\.Module\b", ":obj:`torch.nn.Module`", lines[i]) + # Ensure torch.Tensor is hyperlinked lines[i] = re.sub(r"\btorch\.Tensor\b", ":obj:`torch.Tensor`", lines[i]) # Handle Any & Callable types @@ -257,13 +252,6 @@ def autodoc_process_docstring( lines[i] = re.sub(r"\bint\b", ":obj:`int`", lines[i]) lines[i] = re.sub(r"\bfloat\b", ":obj:`float`", lines[i]) - # Handle tensor types that are using lowercase - # Bolding return types doesn't work with Sphinx hyperlinks - lines[i] = lines[i].replace("*tensors*", "tensors") - lines[i] = lines[i].replace("*tensor*", "tensor") - lines[i] = re.sub(r"\btensor\b", ":class:`tensor `", lines[i]) - lines[i] = re.sub(r"\btensors\b", ":class:`tensors `", lines[i]) - # Handle None type lines[i] = re.sub(r"\bNone\b", ":obj:`None`", lines[i]) From 93f24ee5adb4a7ac2b62a92f4b7d1290bcf484b5 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Fri, 29 Jul 2022 18:06:01 -0600 Subject: [PATCH 50/84] Remove `autodoc_preserve_defaults` from `conf.py` --- sphinx/source/conf.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sphinx/source/conf.py 
b/sphinx/source/conf.py index 62c62ff391..947f1e41ec 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -77,11 +77,6 @@ # Inlcude init docstrings into body of autoclass directives autoclass_content = "both" -# Preserve signature defaults -# Prevents entire tensors from being printed, & gives callable functions -# proper names -autodoc_preserve_defaults = True - # Configuration for intersphinx: refer to the Python standard library and PyTorch intersphinx_mapping = { "python": ("https://docs.python.org/3", None), From 9fe28276e81f83809a866c5dd4fef778aee8b43b Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sat, 30 Jul 2022 14:18:44 -0600 Subject: [PATCH 51/84] Fix `conf.py` issues * Prevent extra html tags from being created for `Callable` & `Any`. * Make string replacements are more strict. --- captum/attr/_core/feature_ablation.py | 2 +- captum/attr/_core/feature_permutation.py | 2 +- captum/attr/_core/gradient_shap.py | 2 +- captum/attr/_core/kernel_shap.py | 2 +- captum/attr/_core/layer/layer_activation.py | 2 +- .../layer/layer_gradient_x_activation.py | 2 +- captum/attr/_core/layer/layer_lrp.py | 2 +- captum/attr/_core/occlusion.py | 2 +- captum/attr/_core/saliency.py | 2 +- sphinx/source/conf.py | 49 ++++++++++++++----- 10 files changed, 45 insertions(+), 22 deletions(-) diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index f28745268a..62c14b8e8e 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -48,7 +48,7 @@ def __init__(self, forward_func: Callable) -> None: Args: forward_func (Callable): The forward function of the model or - any modification of it + any modification of it. 
""" PerturbationAttribution.__init__(self, forward_func) self.use_weights = False diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index 2b705cc476..a848262184 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -76,7 +76,7 @@ def __init__( Args: forward_func (Callable): The forward function of the model or - any modification of it + any modification of it. perm_func (Callable, optional): A function that accepts a batch of inputs and a feature mask, and "permutes" the feature using feature mask across the batch. This defaults to a function diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index f074e4a938..ff41a3ff0d 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -295,7 +295,7 @@ def __init__(self, forward_func: Callable, multiply_by_inputs=True) -> None: Args: forward_func (Callable): The forward function of the model or - any modification of it + any modification of it. multiply_by_inputs (bool, optional): Indicates whether to factor model inputs' multiplier in the final attribution scores. In the literature this is also known as local vs global diff --git a/captum/attr/_core/kernel_shap.py b/captum/attr/_core/kernel_shap.py index a1151e6f53..d7ddc6c5c3 100644 --- a/captum/attr/_core/kernel_shap.py +++ b/captum/attr/_core/kernel_shap.py @@ -30,7 +30,7 @@ def __init__(self, forward_func: Callable) -> None: Args: forward_func (Callable): The forward function of the model or - any modification of it + any modification of it. 
""" Lime.__init__( self, diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py index c3f565d6c7..ea03ae25c7 100644 --- a/captum/attr/_core/layer/layer_activation.py +++ b/captum/attr/_core/layer/layer_activation.py @@ -87,7 +87,7 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* or *list* of **attributions**: + *tensor* or tuple of *tensors* or list of **attributions**: - **attributions** (*tensor* or tuple of *tensors* or *list*): Activation of each neuron in given layer output. Attributions will always be the same size as the diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index dbf0d15992..8f590956f3 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -134,7 +134,7 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* or *list* of **attributions**: + *tensor* or tuple of *tensors* or list of **attributions**: - **attributions** (*tensor* or tuple of *tensors* or *list*): Product of gradient and activation for each neuron in given layer output. 
diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index f1ac39632f..56795af994 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -176,7 +176,7 @@ def attribute( Returns: *tensor* or tuple of *tensors* of **attributions** or 2-element tuple of - **attributions**, **delta** or lists of **attributions** and **delta**: + **attributions**, **delta** or list of **attributions** and **delta**: - **attributions** (*tensor* or tuple of *tensors*): The propagated relevance values with respect to each diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index 7db0157c8c..12bae0c6f9 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -40,7 +40,7 @@ def __init__(self, forward_func: Callable) -> None: Args: forward_func (Callable): The forward function of the model or - any modification of it + any modification of it. """ FeatureAblation.__init__(self, forward_func) self.use_weights = True diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py index 8b784e7a9c..95feff7478 100644 --- a/captum/attr/_core/saliency.py +++ b/captum/attr/_core/saliency.py @@ -28,7 +28,7 @@ def __init__(self, forward_func: Callable) -> None: Args: forward_func (Callable): The forward function of the model or - any modification of it + any modification of it. 
""" GradientAttribution.__init__(self, forward_func) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index 947f1e41ec..46a9340e6b 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -77,6 +77,11 @@ # Inlcude init docstrings into body of autoclass directives autoclass_content = "both" +# Preserve signature defaults +# Prevents entire tensors from being printed, & gives callable functions +# proper names +autodoc_preserve_defaults = True + # Configuration for intersphinx: refer to the Python standard library and PyTorch intersphinx_mapping = { "python": ("https://docs.python.org/3", None), @@ -208,6 +213,22 @@ # -- Docstring Improvements -------------------------------------------------- +def replace_pattern(s: str) -> str: + """ + Wrap a string in regex code so that existing Sphinx formatting is not interfered + with. This function ensures that the string will not be replaced if it is preceded + by '`' or '<', ends with '>', or is inside square brackets '[' & ']'. + + Args: + + s (str): A string to replace. + + Returns: + s (str): The input string wrapped in regex code. 
+ """ + return r"(?])" + + def autodoc_process_docstring( app, what: str, name: str, obj, options, lines: List[str] ) -> None: @@ -224,31 +245,33 @@ def autodoc_process_docstring( # Skip unless line is an parameter doc or a return doc if not (lines[i].startswith(":type") or lines[i].startswith(":rtype")): continue + if ":py:data:" in lines[i]: + continue # Ensure torch.Tensor is hyperlinked - lines[i] = re.sub(r"\btorch\.Tensor\b", ":obj:`torch.Tensor`", lines[i]) - - # Handle Any & Callable types - lines[i] = re.sub(r"\bAny\b", ":obj:`Any `", lines[i]) lines[i] = re.sub( - r"\bCallable\b", ":obj:`Callable `", lines[i] + replace_pattern(r"\btorch\.Tensor\b"), ":class:`torch.Tensor`", lines[i] ) + # Handle Any & Callable types + lines[i] = re.sub(r"\bAny\b", ":data:`~typing.Any`", lines[i]) + lines[i] = re.sub(r"\bCallable\b", ":data:`~typing.Callable`", lines[i]) + # Handle list & tuple types - lines[i] = re.sub(r"\blist\b", ":obj:`list`", lines[i]) - lines[i] = re.sub(r"\btuple\b", ":obj:`tuple`", lines[i]) + lines[i] = re.sub(replace_pattern(r"\blist\b"), ":class:`list`", lines[i]) + lines[i] = re.sub(replace_pattern(r"\btuple\b"), ":class:`tuple`", lines[i]) # Handle str, bool, & slice types - lines[i] = re.sub(r"\bstr\b", ":obj:`str`", lines[i]) - lines[i] = re.sub(r"\bbool\b", ":obj:`bool`", lines[i]) - lines[i] = re.sub(r"\bslice\b", ":obj:`slice`", lines[i]) + lines[i] = re.sub(replace_pattern(r"\bstr\b"), ":class:`str`", lines[i]) + lines[i] = re.sub(replace_pattern(r"\bbool\b"), ":class:`bool`", lines[i]) + lines[i] = re.sub(replace_pattern(r"\bslice\b"), ":class:`slice`", lines[i]) # Handle int & float types - lines[i] = re.sub(r"\bint\b", ":obj:`int`", lines[i]) - lines[i] = re.sub(r"\bfloat\b", ":obj:`float`", lines[i]) + lines[i] = re.sub(replace_pattern(r"\bint\b"), ":class:`int`", lines[i]) + lines[i] = re.sub(replace_pattern(r"\bfloat\b"), ":class:`float`", lines[i]) # Handle None type - lines[i] = re.sub(r"\bNone\b", ":obj:`None`", lines[i]) + 
lines[i] = re.sub(replace_pattern(r"\bNone\b"), ":class:`None`", lines[i]) def setup(app) -> None: From c0599c9cae098a9e685f56d1e5fa573ded4a8100 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 31 Jul 2022 14:15:19 -0600 Subject: [PATCH 52/84] Remove the `autodoc_process_docstring` function --- sphinx/source/conf.py | 70 ------------------------------------------- 1 file changed, 70 deletions(-) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index 46a9340e6b..64db4c5586 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -10,9 +10,7 @@ # -- Path setup -------------------------------------------------------------- import os -import re import sys -from typing import List base_path = os.path.abspath(os.path.join(__file__, "..", "..", "..")) # read module from src instead of installation @@ -208,71 +206,3 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True - - -# -- Docstring Improvements -------------------------------------------------- - - -def replace_pattern(s: str) -> str: - """ - Wrap a string in regex code so that existing Sphinx formatting is not interfered - with. This function ensures that the string will not be replaced if it is preceded - by '`' or '<', ends with '>', or is inside square brackets '[' & ']'. - - Args: - - s (str): A string to replace. - - Returns: - s (str): The input string wrapped in regex code. - """ - return r"(?])" - - -def autodoc_process_docstring( - app, what: str, name: str, obj, options, lines: List[str] -) -> None: - """ - Modify docstrings before creating html files. - - Sphinx converts the 'Args:' and 'Returns:' sections of docstrings into - reStructuredText (rST) syntax, which can then be found via ':type' & ':rtype'. 
- - See here for more information: - https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html - """ - for i in range(len(lines)): - # Skip unless line is an parameter doc or a return doc - if not (lines[i].startswith(":type") or lines[i].startswith(":rtype")): - continue - if ":py:data:" in lines[i]: - continue - - # Ensure torch.Tensor is hyperlinked - lines[i] = re.sub( - replace_pattern(r"\btorch\.Tensor\b"), ":class:`torch.Tensor`", lines[i] - ) - - # Handle Any & Callable types - lines[i] = re.sub(r"\bAny\b", ":data:`~typing.Any`", lines[i]) - lines[i] = re.sub(r"\bCallable\b", ":data:`~typing.Callable`", lines[i]) - - # Handle list & tuple types - lines[i] = re.sub(replace_pattern(r"\blist\b"), ":class:`list`", lines[i]) - lines[i] = re.sub(replace_pattern(r"\btuple\b"), ":class:`tuple`", lines[i]) - - # Handle str, bool, & slice types - lines[i] = re.sub(replace_pattern(r"\bstr\b"), ":class:`str`", lines[i]) - lines[i] = re.sub(replace_pattern(r"\bbool\b"), ":class:`bool`", lines[i]) - lines[i] = re.sub(replace_pattern(r"\bslice\b"), ":class:`slice`", lines[i]) - - # Handle int & float types - lines[i] = re.sub(replace_pattern(r"\bint\b"), ":class:`int`", lines[i]) - lines[i] = re.sub(replace_pattern(r"\bfloat\b"), ":class:`float`", lines[i]) - - # Handle None type - lines[i] = re.sub(replace_pattern(r"\bNone\b"), ":class:`None`", lines[i]) - - -def setup(app) -> None: - app.connect("autodoc-process-docstring", autodoc_process_docstring) From 1b4d23af020d4e4b47a1e8f6e97eef4377f284f2 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 31 Jul 2022 14:41:30 -0600 Subject: [PATCH 53/84] Improve docs --- captum/_utils/av.py | 2 +- captum/attr/_utils/attribution.py | 2 +- captum/influence/_core/similarity_influence.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/captum/_utils/av.py b/captum/_utils/av.py index ccc9a1763e..f7f3512c84 100644 --- a/captum/_utils/av.py +++ b/captum/_utils/av.py @@ -368,7 +368,7 @@ def 
_compute_and_save_activations( different training batches. num_id (str): An required string representing the batch number for which the activation vectors are computed - additional_forward_args (optional): Additional arguments that will be + additional_forward_args (Any, optional): Additional arguments that will be passed to `model` after inputs. Default: None load_from_disk (bool): Forces function to regenerate activations if False. diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index 60bb8da663..0ca4f69d52 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -104,7 +104,7 @@ def has_convergence_delta(self) -> bool: tensor's dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - *args (optional): Additonal arguments that are used by the + *args (Any, optional): Additonal arguments that are used by the sub-classes depending on the specific implementation of `compute_convergence_delta`. diff --git a/captum/influence/_core/similarity_influence.py b/captum/influence/_core/similarity_influence.py index 6123005a68..28540b0021 100644 --- a/captum/influence/_core/similarity_influence.py +++ b/captum/influence/_core/similarity_influence.py @@ -172,7 +172,7 @@ def influence( # type: ignore[override] to the batch size. A tuple of tensors is only passed in if this is the input form that `module` accepts. top_k (int): The number of top-matching activations to return - additional_forward_args (optional): Additional arguments that will be + additional_forward_args (Any, optional): Additional arguments that will be passed to forward_func after inputs. load_src_from_disk (bool): Loads activations for `influence_src_dataset` where possible. 
Setting to False would force regeneration of From 1a02e03e746a4b71657a8c4d80db13b3acc8c6ca Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 1 Aug 2022 09:13:55 -0600 Subject: [PATCH 54/84] Fix mistakes --- captum/attr/_utils/summarizer.py | 2 +- captum/influence/_core/tracincp.py | 6 +++--- captum/influence/_core/tracincp_fast_rand_proj.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/captum/attr/_utils/summarizer.py b/captum/attr/_utils/summarizer.py index 874e5d263b..f82ed6357a 100644 --- a/captum/attr/_utils/summarizer.py +++ b/captum/attr/_utils/summarizer.py @@ -193,7 +193,7 @@ def update(self, x: Tensor): Updates the summary of a given tensor `x` Args: - x (Tensor): + x (tensor): The tensor to summarize """ for stat in self._stats: diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 49ccac10db..04e06aa06b 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -227,7 +227,7 @@ def self_influence( more details on the assumed structure of a batch. show_progress (bool, optional): Computation of self influence scores can take a long time if `inputs_dataset` represents many examples. If - `show_progress`is true, the progress of this computation will be + `show_progress` is true, the progress of this computation will be displayed. In more detail, this computation will iterate over all checkpoints (provided as the `checkpoints` initialization argument) in an outer loop, and iterate over all batches that @@ -980,7 +980,7 @@ def self_influence( more details on the assumed structure of a batch. show_progress (bool, optional): Computation of self influence scores can take a long time if `inputs_dataset` represents many examples. If - `show_progress`is true, the progress of this computation will be + `show_progress` is true, the progress of this computation will be displayed. 
In more detail, this computation will iterate over all checkpoints (provided as the `checkpoints` initialization argument) in an outer loop, and iterate over all batches that @@ -996,7 +996,7 @@ def self_influence( Default: False Returns: - self_influence_scores (Tensor): This is a 1D tensor containing the self + self_influence_scores (tensor): This is a 1D tensor containing the self influence scores of all examples in `inputs_dataset`, regardless of whether it represents a single batch or a `DataLoader` that yields batches. diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index 204eddbc20..fc4d794ed8 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -516,7 +516,7 @@ def self_influence( more details on the assumed structure of a batch. show_progress (bool, optional): Computation of self influence scores can take a long time if `inputs_dataset` represents many examples. If - `show_progress`is true, the progress of this computation will be + `show_progress` is true, the progress of this computation will be displayed. In more detail, this computation will iterate over all checkpoints (provided as the `checkpoints` initialization argument) in an outer loop, and iterate over all batches that @@ -532,7 +532,7 @@ def self_influence( Default: False Returns: - self_influence_scores (Tensor): This is a 1D tensor containing the self + self_influence_scores (tensor): This is a 1D tensor containing the self influence scores of all examples in `inputs_dataset`, regardless of whether it represents a single batch or a `DataLoader` that yields batches. @@ -981,7 +981,7 @@ def self_influence( more details on the assumed structure of a batch. show_progress (bool, optional): Computation of self influence scores can take a long time if `inputs_dataset` represents many examples. 
If - `show_progress`is true, the progress of this computation will be + `show_progress` is true, the progress of this computation will be displayed. In more detail, this computation will iterate over all checkpoints (provided as the `checkpoints` initialization argument) and all batches that `inputs_dataset` represents. Therefore, the @@ -995,7 +995,7 @@ def self_influence( Default: False Returns: - self_influence_scores (Tensor): This is a 1D tensor containing the self + self_influence_scores (tensor): This is a 1D tensor containing the self influence scores of all examples in `inputs_dataset`, regardless of whether it represents a single batch or a `DataLoader` that yields batches. From e5a2b5d8e5c5972583398187a62838b27635f9aa Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 1 Aug 2022 09:33:45 -0600 Subject: [PATCH 55/84] Improve docs * `common.rst` hasn't been working at all for years, so I removed it. It also only seemed to contain internal / non public validation functions. * Added missing approximation methods page to `index.rst` --- sphinx/source/approximation_methods.rst | 2 +- sphinx/source/common.rst | 15 --------------- sphinx/source/index.rst | 1 + 3 files changed, 2 insertions(+), 16 deletions(-) delete mode 100644 sphinx/source/common.rst diff --git a/sphinx/source/approximation_methods.rst b/sphinx/source/approximation_methods.rst index b6b197d92e..4deec709bf 100644 --- a/sphinx/source/approximation_methods.rst +++ b/sphinx/source/approximation_methods.rst @@ -1,4 +1,4 @@ -Captum Approximation +Approximation ==================== .. automodule:: captum.attr._utils.approximation_methods diff --git a/sphinx/source/common.rst b/sphinx/source/common.rst deleted file mode 100644 index 7abf6a382a..0000000000 --- a/sphinx/source/common.rst +++ /dev/null @@ -1,15 +0,0 @@ -Captum.Utils -============ - -.. automodule:: captum.attr._utils.common - -.. autofunction:: _validate_input -.. autofunction:: _validate_noise_tunnel_type -.. 
autofunction:: _reshape_and_sum - -.. currentmodule:: captum._utils.common - -.. autofunction:: _format_inputs -.. autofunction:: _format_output -.. autofunction:: _zeros -.. autofunction:: _run_forward diff --git a/sphinx/source/index.rst b/sphinx/source/index.rst index c54d99c28c..aa67ab6b54 100644 --- a/sphinx/source/index.rst +++ b/sphinx/source/index.rst @@ -21,6 +21,7 @@ Captum API Reference influence utilities base_classes + approximation_methods .. toctree:: :maxdepth: 2 From f7ac15625ca4eba5d9292dc746183af9441f272b Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 1 Aug 2022 13:32:57 -0600 Subject: [PATCH 56/84] Fix docstring types --- captum/influence/_core/tracincp.py | 4 ++-- captum/influence/_core/tracincp_fast_rand_proj.py | 6 +++--- captum/influence/_utils/common.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index f40d3b7894..5390787d84 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -972,7 +972,7 @@ def _self_influence_by_checkpoints( times. Args: - batches (Tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple, or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, @@ -1134,7 +1134,7 @@ def self_influence( for each batch. For large models, loading checkpoints can be time-intensive. Args: - batches (Tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple, or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. 
That is, diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index 7cfde0b732..70d32563a9 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -507,7 +507,7 @@ def _self_influence_by_checkpoints( times. Args: - batches (Tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple, or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, @@ -651,7 +651,7 @@ def self_influence( for each batch. For large models, loading checkpoints can be time-intensive. Args: - batches (Tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple, or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, @@ -1039,7 +1039,7 @@ def self_influence( with are not too large, so that there will not be an out-of-memory error. Args: - batches (Tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple, or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, diff --git a/captum/influence/_utils/common.py b/captum/influence/_utils/common.py index f4fdf3a938..ac9de233c3 100644 --- a/captum/influence/_utils/common.py +++ b/captum/influence/_utils/common.py @@ -355,7 +355,7 @@ def _self_influence_by_batches_helper( instance_name (str): This is the name of the implementation class that `self_influence_batch_fn` is a method of. This is used for displaying warning messages. 
- batches (Tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple, or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, From 7fca541ca346def2689aa6daf8622a72281f4b54 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 2 Aug 2022 11:42:00 -0600 Subject: [PATCH 57/84] Improve docstrings --- captum/attr/_utils/approximation_methods.py | 17 ++++++----------- captum/attr/_utils/class_summarizer.py | 2 +- captum/concept/_core/concept.py | 1 + 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/captum/attr/_utils/approximation_methods.py b/captum/attr/_utils/approximation_methods.py index 9d63e90c1a..755e701d6a 100644 --- a/captum/attr/_utils/approximation_methods.py +++ b/captum/attr/_utils/approximation_methods.py @@ -28,7 +28,7 @@ def approximation_parameters( r"""Retrieves parameters for the input approximation `method` Args: - method: The name of the approximation method. Currently only `riemann` + method (str): The name of the approximation method. Currently only `riemann` and gauss legendre are """ if method in SUPPORTED_RIEMANN_METHODS: @@ -45,17 +45,16 @@ def riemann_builders( Args: - n: The number of integration steps - method: `left`, `right`, `middle` and `trapezoid` riemann + method (Riemann): `left`, `right`, `middle` and `trapezoid` riemann Returns: 2-element tuple of **step_sizes**, **alphas**: - - **step_sizes** (*callable*): + - **step_sizes** (*Callable*): `step_sizes` takes the number of steps as an input argument and returns an array of steps sizes which sum is smaller than or equal to one. 
- - **alphas** (*callable*): + - **alphas** (*Callable*): `alphas` takes the number of steps as an input argument and returns the multipliers/coefficients for the inputs of integrand in the range of [0, 1] @@ -104,18 +103,14 @@ def gauss_legendre_builders() -> Tuple[ proposed by [Xue Feng and her intern Hauroun Habeeb] (https://research.fb.com/people/feng-xue/). - Args: - - n (int): The number of integration steps - Returns: 2-element tuple of **step_sizes**, **alphas**: - - **step_sizes** (*callable*): + - **step_sizes** (*Callable*): `step_sizes` takes the number of steps as an input argument and returns an array of steps sizes which sum is smaller than or equal to one. - - **alphas** (*callable*): + - **alphas** (*Callable*): `alphas` takes the number of steps as an input argument and returns the multipliers/coefficients for the inputs of integrand in the range of [0, 1] diff --git a/captum/attr/_utils/class_summarizer.py b/captum/attr/_utils/class_summarizer.py index 2485711866..9740674136 100644 --- a/captum/attr/_utils/class_summarizer.py +++ b/captum/attr/_utils/class_summarizer.py @@ -36,7 +36,7 @@ def update( # type: ignore This accepts either a single tensor to summarise or a tuple of tensors. Args: - x (Tensor or Tuple[Tensor, ...]): + x (tensor or tuple of tensor): The input tensor to be summarised. The first dimension of this input must be associated to the batch size of the inputs. 
diff --git a/captum/concept/_core/concept.py b/captum/concept/_core/concept.py index a550ab8a9d..f4314fd15b 100644 --- a/captum/concept/_core/concept.py +++ b/captum/concept/_core/concept.py @@ -35,6 +35,7 @@ def __init__( https://pytorch.org/docs/stable/data.html Example:: + >>> # Creates a Concept object named "striped", with a data_iter >>> # object to iterate over all files in "./concepts/striped" >>> concept_name = "striped" From 049bca279e30efadec4e394513d9ca2cf3f1a675 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 2 Aug 2022 12:05:42 -0600 Subject: [PATCH 58/84] Rename `algorithms.md` to `attribution_algorithms.md` as per feedback --- README.md | 2 +- docs/{algorithms.md => attribution_algorithms.md} | 2 +- website/sidebars.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename docs/{algorithms.md => attribution_algorithms.md} (99%) diff --git a/README.md b/README.md index 5731b789d2..f223fe84ee 100644 --- a/README.md +++ b/README.md @@ -487,7 +487,7 @@ Image Classification Models and Saliency Maps, K. Simonyan, et. al. 2014](https: * `Shapley Value Sampling`: [Polynomial calculation of the Shapley value based on sampling](https://www.sciencedirect.com/science/article/pii/S0305054808000804) * `Infidelity and Sensitivity`: [On the (In)fidelity and Sensitivity for Explanations](https://arxiv.org/abs/1901.09392) -More details about the above mentioned [algorithms](https://captum.ai/docs/algorithms) and their pros and cons can be found on our [web-site](https://captum.ai/docs/algorithms_comparison_matrix). +More details about the above mentioned [attribution algorithms](https://captum.ai/docs/attribution_algorithms) and their pros and cons can be found on our [web-site](https://captum.ai/docs/algorithms_comparison_matrix). ## License Captum is BSD licensed, as found in the [LICENSE](LICENSE) file. 
diff --git a/docs/algorithms.md b/docs/attribution_algorithms.md similarity index 99% rename from docs/algorithms.md rename to docs/attribution_algorithms.md index b06a8aa5f1..f1d00a8f53 100644 --- a/docs/algorithms.md +++ b/docs/attribution_algorithms.md @@ -1,5 +1,5 @@ --- -id: algorithms +id: attribution_algorithms title: Algorithm Descriptions --- diff --git a/website/sidebars.json b/website/sidebars.json index 0337e1bbe9..9efb1fddb2 100644 --- a/website/sidebars.json +++ b/website/sidebars.json @@ -1,7 +1,7 @@ { "docs": { "About": ["introduction"], - "General": ["getting_started", "captum_insights", "algorithms", "algorithms_comparison_matrix", "faq", "contribution_guidelines"], + "General": ["getting_started", "captum_insights", "attribution_algorithms", "algorithms_comparison_matrix", "faq", "contribution_guidelines"], "Usage": ["extension/integrated_gradients"] } } From 0be4dff4b71f66725f48c7d3cf823fac2985a204 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Wed, 3 Aug 2022 09:08:18 -0600 Subject: [PATCH 59/84] Improve docstrings & type hints --- captum/_utils/av.py | 8 +++++--- captum/concept/_core/tcav.py | 4 ++-- captum/influence/_utils/common.py | 8 +++++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/captum/_utils/av.py b/captum/_utils/av.py index f7f3512c84..5332345975 100644 --- a/captum/_utils/av.py +++ b/captum/_utils/av.py @@ -80,7 +80,7 @@ def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, ...]]: av = torch.load(fl) return av - def __len__(self): + def __len__(self) -> int: return len(self.files) AV_DIR_NAME: str = "av" @@ -301,11 +301,13 @@ def _manage_loading_layers( are being computed and stored. layers (str or list of str): The layer(s) for which the activation vectors are computed. + load_from_disk (bool, optional): Whether or not to load from disk. + Default: True identifier (str or None): An optional identifier for the layer activations. 
Can be used to distinguish between activations for different training batches. - num_id (str): An optional string representing the batch number for which the - activation vectors are computed + num_id (str, optional): An optional string representing the batch number + for which the activation vectors are computed. Returns: List of layer names for which activations should be generated diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index 245aa5816a..8f27f5f3f2 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -71,7 +71,7 @@ def _i_to_k(self, i): else: right = mid - def __getitem__(self, i): + def __getitem__(self, i: int): """ Returns a batch of activation vectors, as well as a batch of labels indicating which concept the batch of activation vectors is associated @@ -94,7 +94,7 @@ def __getitem__(self, i): labels = torch.tensor([self.labels[k]] * inputs.size(0), device=inputs.device) return inputs, labels - def __len__(self): + def __len__(self) -> int: """ returns the total number of batches in the labelled_dataset """ diff --git a/captum/influence/_utils/common.py b/captum/influence/_utils/common.py index ac9de233c3..264fd54c81 100644 --- a/captum/influence/_utils/common.py +++ b/captum/influence/_utils/common.py @@ -154,9 +154,11 @@ def _load_flexible_state_dict( state_dict and other information. Args: - model: The model for which to load a checkpoint - path: The filepath to the checkpoint - keyname: The key under which the model state_dict is stored, if any. + model: The model for which to load a checkpoint. + path (str): The filepath to the checkpoint. + device_ids (str, optional): The device to use. Default: "cpu" + keyname (str, optional): The key under which the model state_dict is stored, + if any. The module state_dict is modified in-place, and the learning rate is returned. 
""" From fdfa8586b9b4955b19472f30550823efb2fcc49f Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 4 Aug 2022 11:29:56 -0600 Subject: [PATCH 60/84] Don't link directly to arxiv PDFs * Also fixed GradCAM reference link --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f223fe84ee..afa2f99d3f 100644 --- a/README.md +++ b/README.md @@ -471,16 +471,16 @@ You can watch the recorded talk [here](https://www.youtube.com/watch?v=ayhBHZYje * `SmoothGrad`: [SmoothGrad: removing noise by adding noise, Daniel Smilkov et al. 2017](https://arxiv.org/abs/1706.03825) * `NoiseTunnel`: [Sanity Checks for Saliency Maps, Julius Adebayo et al. 2018](https://arxiv.org/abs/1810.03292) * `NeuronConductance`: [How Important is a neuron?, Kedar Dhamdhere et al. 2018](https://arxiv.org/abs/1805.12233) -* `LayerConductance`: [Computationally Efficient Measures of Internal Neuron Importance, Avanti Shrikumar et al. 2018](https://arxiv.org/pdf/1807.09946.pdf) -* `DeepLift`, `NeuronDeepLift`, `LayerDeepLift`: [Learning Important Features Through Propagating Activation Differences, Avanti Shrikumar et al. 2017](https://arxiv.org/pdf/1704.02685.pdf) and [Towards better understanding of gradient-based attribution methods for deep neural networks, Marco Ancona et al. 2018](https://openreview.net/pdf?id=Sy21R9JAW) -* `NeuronIntegratedGradients`: [Computationally Efficient Measures of Internal Neuron Importance, Avanti Shrikumar et al. 2018](https://arxiv.org/pdf/1807.09946.pdf) +* `LayerConductance`: [Computationally Efficient Measures of Internal Neuron Importance, Avanti Shrikumar et al. 2018](https://arxiv.org/abs/1807.09946) +* `DeepLift`, `NeuronDeepLift`, `LayerDeepLift`: [Learning Important Features Through Propagating Activation Differences, Avanti Shrikumar et al. 
2017](https://arxiv.org/abs/1704.02685) and [Towards better understanding of gradient-based attribution methods for deep neural networks, Marco Ancona et al. 2018](https://openreview.net/pdf?id=Sy21R9JAW) +* `NeuronIntegratedGradients`: [Computationally Efficient Measures of Internal Neuron Importance, Avanti Shrikumar et al. 2018](https://arxiv.org/abs/1807.09946) * `GradientShap`, `NeuronGradientShap`, `LayerGradientShap`, `DeepLiftShap`, `NeuronDeepLiftShap`, `LayerDeepLiftShap`: [A Unified Approach to Interpreting Model Predictions, Scott M. Lundberg et al. 2017](http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions) -* `InternalInfluence`: [Influence-Directed Explanations for Deep Convolutional Networks, Klas Leino et al. 2018](https://arxiv.org/pdf/1802.03788.pdf) +* `InternalInfluence`: [Influence-Directed Explanations for Deep Convolutional Networks, Klas Leino et al. 2018](https://arxiv.org/abs/1802.03788) * `Saliency`, `NeuronGradient`: [Deep Inside Convolutional Networks: Visualising -Image Classification Models and Saliency Maps, K. Simonyan, et. al. 2014](https://arxiv.org/pdf/1312.6034.pdf) -* `GradCAM`, `Guided GradCAM`: [Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization, Ramprasaath R. Selvaraju et al. 2017](https://arxiv.org/abs/1610.02391.pdf) -* `Deconvolution`, `Neuron Deconvolution`: [Visualizing and Understanding Convolutional Networks, Matthew D Zeiler et al. 2014](https://arxiv.org/pdf/1311.2901.pdf) -* `Guided Backpropagation`, `Neuron Guided Backpropagation`: [Striving for Simplicity: The All Convolutional Net, Jost Tobias Springenberg et al. 2015](https://arxiv.org/pdf/1412.6806.pdf) +Image Classification Models and Saliency Maps, K. Simonyan, et. al. 2014](https://arxiv.org/abs/1312.6034) +* `GradCAM`, `Guided GradCAM`: [Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization, Ramprasaath R. Selvaraju et al. 
2017](https://arxiv.org/abs/1610.02391) +* `Deconvolution`, `Neuron Deconvolution`: [Visualizing and Understanding Convolutional Networks, Matthew D Zeiler et al. 2014](https://arxiv.org/abs/1311.2901) +* `Guided Backpropagation`, `Neuron Guided Backpropagation`: [Striving for Simplicity: The All Convolutional Net, Jost Tobias Springenberg et al. 2015](https://arxiv.org/abs/1412.6806) * `Feature Permutation`: [Permutation Feature Importance](https://christophm.github.io/interpretable-ml-book/feature-importance.html) * `Occlusion`: [Visualizing and Understanding Convolutional Networks](https://arxiv.org/abs/1311.2901) * `Shapley Value`: [A value for n-person games. Contributions to the Theory of Games 2.28 (1953): 307-317](https://apps.dtic.mil/dtic/tr/fulltext/u2/604084.pdf) From 6f84b64dfa0f452bad34d3d8327d44691cd97a41 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 4 Aug 2022 14:10:03 -0600 Subject: [PATCH 61/84] Fix class variable position for Sphinx --- captum/attr/_utils/attribution.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index 0ca4f69d52..b7f634d725 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -318,12 +318,13 @@ def multiplies_by_inputs(self): class InternalAttribution(Attribution, Generic[ModuleOrModuleList]): - layer: ModuleOrModuleList r""" Shared base class for LayerAttrubution and NeuronAttribution, attribution types that require a model and a particular layer. 
""" + layer: ModuleOrModuleList + def __init__( self, forward_func: Callable, From cc1bcb87586cf505b83cd23b360950959f0f3bec Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 7 Aug 2022 11:28:48 -0600 Subject: [PATCH 62/84] Readd `autodoc_process_docstring` for `Callable` & `Any` --- sphinx/source/conf.py | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index 64db4c5586..f6c2e3053e 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -10,7 +10,9 @@ # -- Path setup -------------------------------------------------------------- import os +import re import sys +from typing import List base_path = os.path.abspath(os.path.join(__file__, "..", "..", "..")) # read module from src instead of installation @@ -206,3 +208,51 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True + + +# -- Docstring Improvements -------------------------------------------------- + + +def _replace_pattern(s: str) -> str: + """ + Wrap a string in regex code so that existing Sphinx formatting is not interfered + with. This function ensures that the string will not be replaced if it is inside + square brackets '[' & ']'. + + Args: + + s (str): A string to replace. + + Returns: + s (str): The input string wrapped in regex code. + """ + return r"(? None: + """ + Modify docstrings before creating html files. + Sphinx converts the 'Args:' and 'Returns:' sections of docstrings into + reStructuredText (rST) syntax, which can then be found via ':type' & ':rtype'. 
+ + See here for more information: + https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html + """ + for i in range(len(lines)): + # Skip unless line is an parameter doc or a return doc + if not lines[i].startswith(":type"): + continue + if ":py:data:" in lines[i]: + continue + + # Ensure Any & Callable types of hyperlinked with intersphinx + lines[i] = re.sub(_replace_pattern(r"\bAny\b"), "~typing.Any", lines[i]) + lines[i] = re.sub( + _replace_pattern(r"\bCallable\b"), "~typing.Callable", lines[i] + ) + + +def setup(app) -> None: + app.connect("autodoc-process-docstring", autodoc_process_docstring) From 0f209ef27a2a327b1231da32ff8dd94adc61a472 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 8 Aug 2022 08:58:26 -0600 Subject: [PATCH 63/84] Handle unused attribution base methods in Sphinx docs --- sphinx/source/feature_ablation.rst | 1 + sphinx/source/feature_permutation.rst | 1 + sphinx/source/kernel_shap.rst | 1 + sphinx/source/lime.rst | 1 + sphinx/source/neuron.rst | 1 + sphinx/source/noise_tunnel.rst | 1 + sphinx/source/occlusion.rst | 1 + sphinx/source/shapley_value_sampling.rst | 2 ++ 8 files changed, 9 insertions(+) diff --git a/sphinx/source/feature_ablation.rst b/sphinx/source/feature_ablation.rst index 05467941f3..e337aecf73 100644 --- a/sphinx/source/feature_ablation.rst +++ b/sphinx/source/feature_ablation.rst @@ -3,3 +3,4 @@ Feature Ablation .. autoclass:: captum.attr.FeatureAblation :members: + :exclude-members: compute_convergence_delta diff --git a/sphinx/source/feature_permutation.rst b/sphinx/source/feature_permutation.rst index 6387691cd1..609ff1ff39 100644 --- a/sphinx/source/feature_permutation.rst +++ b/sphinx/source/feature_permutation.rst @@ -3,3 +3,4 @@ Feature Permutation .. 
autoclass:: captum.attr.FeaturePermutation :members: + :exclude-members: compute_convergence_delta diff --git a/sphinx/source/kernel_shap.rst b/sphinx/source/kernel_shap.rst index 48cfde3535..421ed0ea62 100644 --- a/sphinx/source/kernel_shap.rst +++ b/sphinx/source/kernel_shap.rst @@ -3,3 +3,4 @@ KernelShap .. autoclass:: captum.attr.KernelShap :members: + :exclude-members: compute_convergence_delta diff --git a/sphinx/source/lime.rst b/sphinx/source/lime.rst index 4c722304f1..483458572c 100644 --- a/sphinx/source/lime.rst +++ b/sphinx/source/lime.rst @@ -3,6 +3,7 @@ Lime .. autoclass:: captum.attr.LimeBase :members: + :exclude-members: compute_convergence_delta .. autoclass:: captum.attr.Lime :members: diff --git a/sphinx/source/neuron.rst b/sphinx/source/neuron.rst index 6f894df028..897f237baf 100644 --- a/sphinx/source/neuron.rst +++ b/sphinx/source/neuron.rst @@ -54,3 +54,4 @@ Neuron Feature Ablation .. autoclass:: captum.attr.NeuronFeatureAblation :members: + :exclude-members: compute_convergence_delta diff --git a/sphinx/source/noise_tunnel.rst b/sphinx/source/noise_tunnel.rst index e1aff40b18..15b6ec7dbf 100644 --- a/sphinx/source/noise_tunnel.rst +++ b/sphinx/source/noise_tunnel.rst @@ -3,3 +3,4 @@ NoiseTunnel .. autoclass:: captum.attr.NoiseTunnel :members: + :exclude-members: compute_convergence_delta diff --git a/sphinx/source/occlusion.rst b/sphinx/source/occlusion.rst index a05b236e24..5867d739b9 100644 --- a/sphinx/source/occlusion.rst +++ b/sphinx/source/occlusion.rst @@ -3,3 +3,4 @@ Occlusion .. autoclass:: captum.attr.Occlusion :members: + :exclude-members: compute_convergence_delta diff --git a/sphinx/source/shapley_value_sampling.rst b/sphinx/source/shapley_value_sampling.rst index 667874d805..4d40338540 100644 --- a/sphinx/source/shapley_value_sampling.rst +++ b/sphinx/source/shapley_value_sampling.rst @@ -3,5 +3,7 @@ Shapley Value Sampling .. autoclass:: captum.attr.ShapleyValueSampling :members: + :exclude-members: compute_convergence_delta .. 
autoclass:: captum.attr.ShapleyValues :members: + :exclude-members: compute_convergence_delta From 87d53004e3e66d9c77908f00ecdaf06e77d5022f Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 8 Aug 2022 09:44:21 -0600 Subject: [PATCH 64/84] Improve Sphinx warnings --- sphinx/source/pytext.rst | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sphinx/source/pytext.rst b/sphinx/source/pytext.rst index 66c847dcd9..f11a6a2099 100644 --- a/sphinx/source/pytext.rst +++ b/sphinx/source/pytext.rst @@ -1,11 +1,8 @@ Captum.Models ========================== -.. automodule:: captum.attr._models.pytext - -.. autoclass:: PyTextInterpretableEmbedding +.. autoclass:: captum.attr._models.pytext.PyTextInterpretableEmbedding :members: - -.. autoclass:: BaselineGenerator +.. autoclass:: captum.attr._models.pytext.BaselineGenerator :members: From 7018ff8462897cedd1a9f4b0d19457ddb29dc5c0 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 11 Aug 2022 09:31:33 -0600 Subject: [PATCH 65/84] Fix docstring --- captum/robust/_core/metrics/attack_comparator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/captum/robust/_core/metrics/attack_comparator.py b/captum/robust/_core/metrics/attack_comparator.py index 9ea7b2cd7a..ac484da866 100644 --- a/captum/robust/_core/metrics/attack_comparator.py +++ b/captum/robust/_core/metrics/attack_comparator.py @@ -247,7 +247,7 @@ def evaluate( Args: - inputs (Any): Input for which attack metrics + inputs (Any): Input for which attack metrics are computed. It can be provided as a tensor, tuple of tensors, or any raw input type (e.g. PIL image or text string). This input is provided directly as input to preproc function as well @@ -255,7 +255,7 @@ def evaluate( function is provided, this input is provided directly to the main model and all attacks. 
- additional_forward_args (Any, optional): If the forward function + additional_forward_args (Any, optional): If the forward function requires additional arguments other than the preprocessing outputs (or inputs if preproc_fn is None), this argument can be provided. It must be either a single additional @@ -268,7 +268,7 @@ def evaluate( correspond to the number of examples. For all other types, the given argument is used for all forward evaluations. Default: ``None`` - perturbations_per_eval (int, optional): Allows perturbations of multiple + perturbations_per_eval (int, optional): Allows perturbations of multiple attacks to be grouped and evaluated in one call of forward_fn Each forward pass will contain a maximum of perturbations_per_eval * #examples samples. @@ -281,7 +281,7 @@ def evaluate( (or inputs itself if no preproc_fn is provided) must be a tensor or tuple of tensors. Default: ``1`` - kwargs (Any, optional): Additional keyword arguments provided to metric function + kwargs (Any, optional): Additional keyword arguments provided to metric function as well as selected attacks based on chosen additional_args. Default: ``None`` From 430ae747ec5bc12e8151ba7974cb0aa81de11686 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 11 Aug 2022 10:41:59 -0600 Subject: [PATCH 66/84] Fix lint error --- captum/robust/_core/metrics/attack_comparator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/captum/robust/_core/metrics/attack_comparator.py b/captum/robust/_core/metrics/attack_comparator.py index ac484da866..030be219a5 100644 --- a/captum/robust/_core/metrics/attack_comparator.py +++ b/captum/robust/_core/metrics/attack_comparator.py @@ -281,8 +281,8 @@ def evaluate( (or inputs itself if no preproc_fn is provided) must be a tensor or tuple of tensors. Default: ``1`` - kwargs (Any, optional): Additional keyword arguments provided to metric function - as well as selected attacks based on chosen additional_args. 
+ kwargs (Any, optional): Additional keyword arguments provided to metric + function as well as selected attacks based on chosen additional_args. Default: ``None`` Returns: From e639bea5d6b781293649b257a8ad203ed4b14959 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 11 Aug 2022 11:02:19 -0600 Subject: [PATCH 67/84] Fix spelling --- scripts/install_via_pip.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/install_via_pip.sh b/scripts/install_via_pip.sh index 7a13dedb9e..de643e0687 100755 --- a/scripts/install_via_pip.sh +++ b/scripts/install_via_pip.sh @@ -37,7 +37,7 @@ export TERM=xterm # NOTE: All of the below installs use sudo, b/c otherwise pip will get # permission errors installing in the docker container. An alternative would be # to use a virtualenv, but that would lead to bifurcation of the CircleCI config -# since we'd need to source the environemnt in each step. +# since we'd need to source the environment in each step. # upgrade pip sudo pip install --upgrade pip @@ -55,7 +55,7 @@ fi if [[ $PYTORCH_NIGHTLY == true ]]; then sudo pip install --upgrade --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html else - # If no version specified, upgrade to latest release. + # If no version is specified, upgrade to the latest release. 
if [[ $CHOSEN_TORCH_VERSION == -1 ]]; then sudo pip install --upgrade torch else From b12c6c19ece84fa5cee93c19edf6134dd22685c2 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Fri, 12 Aug 2022 09:10:58 -0600 Subject: [PATCH 68/84] Fix grammar & spelling --- docs/contribution_guide.md | 4 ++-- docs/extension/integrated_gradients.md | 10 +++++----- docs/faq.md | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/contribution_guide.md b/docs/contribution_guide.md index 731e12bfc0..6b635b25ac 100644 --- a/docs/contribution_guide.md +++ b/docs/contribution_guide.md @@ -4,7 +4,7 @@ title: The Captum Contribution Process --- The Captum development process involves a healthy amount of open discussions between the core development team and the community. -Captum operates similar to most open source projects on GitHub. However, if you've never contributed to an open source project before, here is the basic process. +Captum operates similarly to most open source projects on GitHub. However, if you've never contributed to an open source project before, here is the basic process. 1. **Figure out what you're going to work on.** @@ -59,7 +59,7 @@ https://captum.ai/tutorials/Bert_SQUAD_Interpret https://captum.ai/tutorials/IMDB_TorchText_Interpret **Vision** -- We provide a sample toy model for CIFAR dataset and examples with ResNet model. +- We provide a sample toy model for the CIFAR dataset and examples with a ResNet model. https://captum.ai/tutorials/CIFAR_TorchVision_Interpret https://captum.ai/tutorials/Resnet_TorchVision_Interpret These would be great starting points for benchmarking. diff --git a/docs/extension/integrated_gradients.md b/docs/extension/integrated_gradients.md index 0a00fb0ad1..ebcca190ec 100644 --- a/docs/extension/integrated_gradients.md +++ b/docs/extension/integrated_gradients.md @@ -42,7 +42,7 @@ class ToyModel(nn.Module): Second, let's apply integrated gradients on the toy model's output layer using sample data. 
The code snippet below computes the attribution of output with respect to the inputs. -`attribute` method of `IntegratedGradients` class returns input attributions which +The `attribute` method of `IntegratedGradients` class returns input attributions which have the same size and dimensionality as the inputs and an approximation error which is computed based on the completeness property of the integrated gradients. Completeness property is one of the axioms that integrated gradients satisfies. @@ -114,7 +114,7 @@ class ToySoftmaxModel(nn.Module): Now, let's apply integrated gradients on the toy classification model defined above using inputs that contain a range of numbers. We also choose an arbitrary target class (target_class_index: 5) which we use to attribute our predictions to. -Similar to previous example the output of attribution is a tensor with the same +Similar to the previous example, the output of attribution is a tensor with the same dimensionality as the inputs and an approximation error computed based on the completeness property of integrated gradients. @@ -157,9 +157,9 @@ Now, let's look at a model that besides input tensors takes input arguments of other types. In practice this can be used to pass the sequence length or the word/token indices in a sequence of a text, for instance. The example below demonstrates how to use `additional_forward_args`. In this particular example -`additional_forward_args` represents single integer value. -Those arguments are passed as `additional_forward_args` to `attribute` method and -they will be passed to model's forward function followed by inputs in the oder +`additional_forward_args` represents a single integer value. +Those arguments are passed as `additional_forward_args` to the `attribute` method and +they will be passed to the model's forward function followed by inputs in the order provided in `additional_forward_args`. 
In the example below, we also demonstrate how to apply integrated gradients to a batch of samples. The first dimension of the input corresponds to the batch size. diff --git a/docs/faq.md b/docs/faq.md index de4e22ea4c..16bf59b54a 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -9,7 +9,7 @@ title: FAQ * [Are SmoothGrad or VarGrad supported in Captum?](#are-smoothgrad-or-vargrad-supported-in-captum) * [How do I use Captum with BERT models?](#how-do-i-use-captum-with-bert-models) * [My model inputs or outputs token indices, and when using Captum I see errors relating to gradients, how do I resolve this?](#my-model-inputs-or-outputs-token-indices-and-when-using-captum-i-see-errors-relating-to-gradients-how-do-i-resolve-this) -* [Can my model using functional non-linearities (E.g. nn.functional.ReLU) or reused modules be used with Captum?](#can-my-model-using-functional-non-linearities-eg-nnfunctionalrelu-or-reused-modules-be-used-with-captum) +* [Can my model use functional non-linearities (E.g. nn.functional.ReLU) or can reused modules be used with Captum?](#can-my-model-use-functional-non-linearities-eg-nnfunctionalrelu-or-can-reused-modules-be-used-with-captum) * [Do JIT models, DataParallel models, or DistributedDataParallel models work with Captum?](#do-jit-models-dataparallel-models-or-distributeddataparallel-models-work-with-captum) * [I am working on a new interpretability or attribution method and would like to add it to Captum. How do I proceed?](#i-am-working-on-a-new-interpretability-or-attribution-method-and-would-like-to-add-it-to-captum-how-do-i-proceed) * [I am using a gradient-based attribution algorithm such as integrated gradients for a RNN or LSTM network and I see 'cudnn RNN backward can only be called in training mode'. 
How can I resolve this issue ?](#how-can-I-resolve-cudnn-RNN-backward-error-for-RNN-or-LSTM-network) @@ -53,7 +53,7 @@ For NLP models that take token indices as inputs, we cannot take gradients with If the output of the model is a token index, such as an image captioning cases, it is necessary to attribute with respect to the token score or probability rather than the index. Make sure that the model returns this and use target to choose the appropriate scalar score to attribute with respect to. -### **Can my model using functional non-linearities (E.g. nn.functional.ReLU) or reused modules be used with Captum?** +### **Can my model use functional non-linearities (E.g. nn.functional.ReLU) or can reused modules be used with Captum?** Most methods will work fine with functional non-linearities and arbitrary operations. Some methods, which require placing hooks during back-propagation, including DeepLift, DeepLiftShap, Guided Backpropagation, and Deconvolution will not work appropriately with functional non-linearities and must use the corresponding module activation (e.g. torch.nn.ReLU) which should be initialized in the module constructor. For DeepLift, it is important to also not reuse modules in the forward function, since this can cause issues in the propagation of multipliers. Computing layer or neuron attribution with layer modules that are used multiple times generally computes attributions for the last execution of the module. For more information regarding these restrictions, refer to the API documentation for the specific method, including DeepLift, DeepLiftShap, Guided Backpropagation, and Deconvolution. 
From b300c691005dc15c6baa402f2ca0e7e5d90fa22c Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 14 Aug 2022 09:40:56 -0600 Subject: [PATCH 69/84] Fix doctring & add type hints --- captum/attr/_models/base.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/captum/attr/_models/base.py b/captum/attr/_models/base.py index 3a62281d08..2f526c197a 100644 --- a/captum/attr/_models/base.py +++ b/captum/attr/_models/base.py @@ -99,10 +99,10 @@ class TokenReferenceBase: `TokenReferenceBase` class. """ - def __init__(self, reference_token_idx=0) -> None: + def __init__(self, reference_token_idx: int = 0) -> None: self.reference_token_idx = reference_token_idx - def generate_reference(self, sequence_length, device): + def generate_reference(self, sequence_length, device: torch.device) -> torch.Tensor: r""" Generated reference tensor of given `sequence_length` using `reference_token_idx`. @@ -137,22 +137,25 @@ def _set_deep_layer_value(obj, layer_names, value): setattr(reduce(getattr, layer_names[:-1], obj), layer_names[-1], value) -def configure_interpretable_embedding_layer(model, embedding_layer_name="embedding"): +def configure_interpretable_embedding_layer( + model: Module, embedding_layer_name: str = "embedding" +) -> InterpretableEmbeddingBase: r""" - This method wraps model's embedding layer with an interpretable embedding + This method wraps a model's embedding layer with an interpretable embedding layer that allows us to access the embeddings through their indices. Args: - model (torch.nn.Model): An instance of PyTorch model that contains embeddings. + model (torch.nn.Module): An instance of PyTorch model that contains embeddings. embedding_layer_name (str, optional): The name of the embedding layer in the `model` that we would like to make interpretable. 
Returns: - interpretable_emb (tensor): An instance of `InterpretableEmbeddingBase` - embedding layer that wraps model's embedding layer that is being - accessed through `embedding_layer_name`. + interpretable_emb (InterpretableEmbeddingBase): An instance of + `InterpretableEmbeddingBase` embedding layer that wraps model's + embedding layer that is being accessed through + `embedding_layer_name`. Examples:: @@ -202,7 +205,9 @@ def configure_interpretable_embedding_layer(model, embedding_layer_name="embeddi return interpretable_emb -def remove_interpretable_embedding_layer(model, interpretable_emb): +def remove_interpretable_embedding_layer( + model: Module, interpretable_emb: InterpretableEmbeddingBase +) -> None: r""" Removes interpretable embedding layer and sets back original embedding layer in the model. @@ -210,8 +215,8 @@ def remove_interpretable_embedding_layer(model, interpretable_emb): Args: model (torch.nn.Module): An instance of PyTorch model that contains embeddings - interpretable_emb (tensor): An instance of `InterpretableEmbeddingBase` - that was originally created in + interpretable_emb (InterpretableEmbeddingBase): An instance of + `InterpretableEmbeddingBase` that was originally created in `configure_interpretable_embedding_layer` function and has to be removed after interpretation is finished. 
From ef48de6ec2f99f55c930ec189a81c90ca2be6c21 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 14 Aug 2022 12:48:27 -0600 Subject: [PATCH 70/84] Fix docstrings --- captum/attr/_core/layer/layer_gradient_shap.py | 2 +- captum/attr/_core/layer/layer_lrp.py | 2 +- captum/attr/_core/lrp.py | 7 ++++--- captum/attr/_core/neuron/neuron_gradient_shap.py | 2 +- captum/attr/_core/noise_tunnel.py | 2 +- captum/insights/attr_vis/app.py | 4 ++-- captum/metrics/_core/sensitivity.py | 10 ++++++---- 7 files changed, 16 insertions(+), 13 deletions(-) diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 851bebc5ab..35559ea8c8 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -182,7 +182,7 @@ def attribute( per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or a tuple of float optional): The standard deviation + stdevs (float, or tuple of float, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index 56795af994..e2ae163f07 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -42,7 +42,7 @@ def __init__(self, model: Module, layer: ModuleOrModuleList) -> None: """ Args: - model (module): The forward function of the model or + model (Module): The forward function of the model or any modification of it. Custom rules for a given layer need to be defined as attribute `module.rule` and need to be of type PropagationRule. 
diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index bd8d887a76..e0c528d542 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -45,7 +45,7 @@ def __init__(self, model: Module) -> None: r""" Args: - model (module): The forward function of the model or any modification of + model (Module): The forward function of the model or any modification of it. Custom rules for a given layer need to be defined as attribute `module.rule` and need to be of type PropagationRule. If no rule is specified for a layer, a pre-defined default rule for the module type @@ -254,9 +254,10 @@ def compute_convergence_delta( examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - output (tensor with single element): The output value with respect to which + output (tensor): The output value with respect to which the attribution values are computed. This value corresponds to - the target score of a classification model. + the target score of a classification model. The given tensor + should only have a single element. Returns: *tensor*: diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index cd2093bcd7..aad7fe34a3 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -174,7 +174,7 @@ def attribute( per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or a tuple of float optional): The standard deviation + stdevs (float, or tuple of float, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. 
If it is diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 9cd903b8bf..1f899e9589 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -117,7 +117,7 @@ def attribute( in each batch. Default: None if `nt_samples_batch_size` is not provided. In this case all `nt_samples` will be processed together. - stdevs (float, or a tuple of float optional): The standard deviation + stdevs (float, or tuple of float, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is diff --git a/captum/insights/attr_vis/app.py b/captum/insights/attr_vis/app.py index 10cc0a2048..83b59105e4 100644 --- a/captum/insights/attr_vis/app.py +++ b/captum/insights/attr_vis/app.py @@ -149,7 +149,7 @@ def __init__( r""" Args: - models (torch.nn.module): One or more PyTorch modules (models) for + models (torch.nn.Module): One or more PyTorch modules (models) for attribution visualization. classes (list of str): List of strings corresponding to the names of classes for classification. @@ -163,7 +163,7 @@ def __init__( a single BaseFeature, while a multimodal classifier may provide a list of features, each corresponding to a different tensor input and potentially different modalities. - dataset (iterable of Batch): Defines the dataset to visualize attributions + dataset (Iterable of Batch): Defines the dataset to visualize attributions for. This must be an iterable of batch objects, each of which may contain multiple input examples. score_func (Callable, optional): This function is applied to the model diff --git a/captum/metrics/_core/sensitivity.py b/captum/metrics/_core/sensitivity.py index 1b58893039..1f4f22877e 100644 --- a/captum/metrics/_core/sensitivity.py +++ b/captum/metrics/_core/sensitivity.py @@ -149,10 +149,12 @@ def sensitivity_max( `perturb_func` function. 
Default: 10 - norm_ord (int, float, inf, -inf, 'fro', 'nuc', optional): The type of norm - that is used to compute the - norm of the sensitivity matrix which is defined as the difference - between the explanation function at its input and perturbed input. + norm_ord (int, float, or str, optional): The type of norm that is used to + compute the norm of the sensitivity matrix which is defined as the + difference between the explanation function at its input and perturbed + input. Acceptable values are either a string of 'fro' or 'nuc', or a + number in the range of [-inf, inf] (including float("-inf") & + float("inf")). Default: 'fro' max_examples_per_batch (int, optional): The number of maximum input From 6410f165bdb20741f356436c7e171b7965cc5d38 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 14 Aug 2022 13:11:06 -0600 Subject: [PATCH 71/84] Fix more docstrings --- captum/attr/_core/gradient_shap.py | 2 +- captum/attr/_core/layer/layer_gradient_shap.py | 2 +- captum/attr/_core/neuron/neuron_conductance.py | 2 +- captum/attr/_core/neuron/neuron_deep_lift.py | 4 ++-- captum/attr/_core/neuron/neuron_feature_ablation.py | 2 +- captum/attr/_core/neuron/neuron_gradient.py | 2 +- captum/attr/_core/neuron/neuron_gradient_shap.py | 4 ++-- .../attr/_core/neuron/neuron_guided_backprop_deconvnet.py | 4 ++-- captum/attr/_core/neuron/neuron_integrated_gradients.py | 2 +- captum/attr/_core/noise_tunnel.py | 2 +- captum/concept/_core/concept.py | 4 ++-- captum/influence/_core/tracincp.py | 6 +++--- captum/influence/_core/tracincp_fast_rand_proj.py | 6 +++--- captum/influence/_utils/common.py | 2 +- 14 files changed, 22 insertions(+), 22 deletions(-) diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index ff41a3ff0d..04d1599db5 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -162,7 +162,7 @@ def attribute( per sample in the input batch. 
Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or tuple of float, optional): The standard deviation + stdevs (float or tuple of float, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 851bebc5ab..6df792584d 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -182,7 +182,7 @@ def attribute( per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or a tuple of float optional): The standard deviation + stdevs (float or tuple of float, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index 66e59dc46c..05f790c776 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -111,7 +111,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index b743e48caa..0d1137e86c 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -99,7 +99,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -344,7 +344,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index 8790697cb2..b5bdadff6b 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -75,7 +75,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 9c817a8a7b..2b332596c9 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -72,7 +72,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index cd2093bcd7..0408750375 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -114,7 +114,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -174,7 +174,7 @@ def attribute( per sample in the input batch. Random examples are generated by adding gaussian random noise to each sample. Default: `5` if `n_samples` is not provided. - stdevs (float, or a tuple of float optional): The standard deviation + stdevs (float or tuple of float, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. 
If it is diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 98d55f4b0b..060109256f 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -78,7 +78,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: @@ -247,7 +247,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. Neuron selector can be provided as: diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index 4ea4d333d6..cce7ee153d 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -92,7 +92,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - neuron_selector (int, Callable, or tuple of int or slice): + neuron_selector (int, Callable, tuple of int, or slice): Selector for neuron in given layer for which attribution is desired. 
Neuron selector can be provided as: diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 9cd903b8bf..17422d31b6 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -117,7 +117,7 @@ def attribute( in each batch. Default: None if `nt_samples_batch_size` is not provided. In this case all `nt_samples` will be processed together. - stdevs (float, or a tuple of float optional): The standard deviation + stdevs (float or tuple of float, optional): The standard deviation of gaussian noise with zero mean that is added to each input in the batch. If `stdevs` is a single float value then that same value is used for all inputs. If it is diff --git a/captum/concept/_core/concept.py b/captum/concept/_core/concept.py index f4314fd15b..74ccba2be5 100644 --- a/captum/concept/_core/concept.py +++ b/captum/concept/_core/concept.py @@ -25,7 +25,7 @@ def __init__( r""" Args: - id (int): The unique identifier of the concept. + id (int): The unique identifier of the concept. name (str): A unique name of the concept. data_iter (DataLoader): A pytorch DataLoader object that combines a dataset and a sampler, and provides an iterable over a given @@ -80,7 +80,7 @@ def __init__(self, model: Module) -> None: Args: - inputs (tensor or tuple of tensors): Inputs for which concept-based + inputs (tensor or tuple of tensors): Inputs for which concept-based interpretation scores are computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors are provided, the batch size (the first diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 5390787d84..9b8e767b29 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -216,7 +216,7 @@ def self_influence( with are not too large, so that there will not be an out-of-memory error. 
Args: - batches (tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, @@ -972,7 +972,7 @@ def _self_influence_by_checkpoints( times. Args: - batches (tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, @@ -1134,7 +1134,7 @@ def self_influence( for each batch. For large models, loading checkpoints can be time-intensive. Args: - batches (tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index 4028ac35bf..5639b04d1d 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -502,7 +502,7 @@ def _self_influence_by_checkpoints( times. Args: - batches (tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, @@ -646,7 +646,7 @@ def self_influence( for each batch. For large models, loading checkpoints can be time-intensive. 
Args: - batches (tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, @@ -1059,7 +1059,7 @@ def self_influence( with are not too large, so that there will not be an out-of-memory error. Args: - batches (tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. That is, diff --git a/captum/influence/_utils/common.py b/captum/influence/_utils/common.py index 17ea18bd74..a7d52e839b 100644 --- a/captum/influence/_utils/common.py +++ b/captum/influence/_utils/common.py @@ -354,7 +354,7 @@ def _self_influence_by_batches_helper( instance_name (str): This is the name of the implementation class that `self_influence_batch_fn` is a method of. This is used for displaying warning messages. - batches (tuple, or DataLoader): Either a single tuple of any, or a + batches (tuple or DataLoader): Either a single tuple of any, or a `DataLoader`, where each batch yielded is a tuple of any. In either case, the tuple represents a single batch, where the last element is assumed to be the labels for the batch. 
That is, From de241007d27e9d16f3be997d997e3737e2052959 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 14 Aug 2022 13:23:26 -0600 Subject: [PATCH 72/84] Iterable types & docstring fixes --- captum/influence/_core/tracincp.py | 4 ++-- captum/influence/_core/tracincp_fast_rand_proj.py | 4 ++-- sphinx/source/conf.py | 8 +++++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 9b8e767b29..96e86c9365 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -132,7 +132,7 @@ def __init__( `model` accepts `L-1` arguments, and the last element of `batch` is the label. In other words, `model(*batch[:-1])` gives the output of `model`, and `batch[-1]` are the labels for the batch. - checkpoints (str or list of str or Iterator): Either the directory of the + checkpoints (str, list of str, or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -532,7 +532,7 @@ def __init__( `model` accepts `L-1` arguments, and the last element of `batch` is the label. In other words, `model(*batch[:-1])` gives the output of `model`, and `batch[-1]` are the labels for the batch. - checkpoints (str or list of str or Iterator): Either the directory of the + checkpoints (str, list of str, or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. 
diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index 5639b04d1d..a104f393e8 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -115,7 +115,7 @@ def __init__( `model` accepts `L-1` arguments, and the last element of `batch` is the label. In other words, `model(*batch[:-1])` gives the output of `model`, and `batch[-1]` are the labels for the batch. - checkpoints (str or list of str or Iterator): Either the directory of the + checkpoints (str, list of str, or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -842,7 +842,7 @@ def __init__( `model` accepts `L-1` arguments, and the last element of `batch` is the label. In other words, `model(*batch[:-1])` gives the output of `model`, and `batch[-1]` are the labels for the batch. - checkpoints (str or list of str or Iterator): Either the directory of the + checkpoints (str, list of str, or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. 
diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index f6c2e3053e..0a4e01fcf5 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -247,11 +247,17 @@ def autodoc_process_docstring( if ":py:data:" in lines[i]: continue - # Ensure Any & Callable types of hyperlinked with intersphinx + # Ensure Any, Callable, & Iterator types are hyperlinked with intersphinx lines[i] = re.sub(_replace_pattern(r"\bAny\b"), "~typing.Any", lines[i]) lines[i] = re.sub( _replace_pattern(r"\bCallable\b"), "~typing.Callable", lines[i] ) + lines[i] = re.sub( + _replace_pattern(r"\bIterable\b"), "~typing.Iterable", lines[i] + ) + lines[i] = re.sub( + _replace_pattern(r"\bIterator\b"), "~typing.Iterator", lines[i] + ) def setup(app) -> None: From cd709b05514acbcf8f074d55287d97f6e303cfb7 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sun, 14 Aug 2022 14:03:10 -0600 Subject: [PATCH 73/84] Fix docstring type --- captum/influence/_core/tracincp.py | 4 ++-- captum/influence/_core/tracincp_fast_rand_proj.py | 4 ++-- sphinx/source/conf.py | 3 --- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 96e86c9365..dec735cb90 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -108,7 +108,7 @@ def __init__( model (torch.nn.Module): An instance of pytorch model. This model should define all of its layers as attributes of the model. - train_dataset (torch.utils.data.Dataset or torch.utils.DataLoader): + train_dataset (torch.utils.data.Dataset or torch.utils.data.DataLoader): In the `influence` method, we either compute the influence score of training examples on examples in a test batch, or self influence scores for those training examples, depending on which mode is used. @@ -508,7 +508,7 @@ def __init__( model (torch.nn.Module): An instance of pytorch model. This model should define all of its layers as attributes of the model. 
- train_dataset (torch.utils.data.Dataset or torch.utils.DataLoader): + train_dataset (torch.utils.data.Dataset or torch.utils.data.DataLoader): In the `influence` method, we either compute the influence score of training examples on examples in a test batch, or self influence scores for those training examples, depending on which mode is used. diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index a104f393e8..e8569908ef 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -91,7 +91,7 @@ def __init__( projection method. Can be either the layer module itself, or the fully qualified name of the layer if it is a defined attribute of the passed `model`. - train_dataset (torch.utils.data.Dataset or torch.utils.DataLoader): + train_dataset (torch.utils.data.Dataset or torch.utils.data.DataLoader): In the `influence` method, we either compute the influence score of training examples on examples in a test batch, or self influence scores for those training examples, depending on which mode is used. @@ -818,7 +818,7 @@ def __init__( projection method. Can be either the layer module itself, or the fully qualified name of the layer if it is a defined attribute of the passed `model`. - train_dataset (torch.utils.data.Dataset or torch.utils.DataLoader): + train_dataset (torch.utils.data.Dataset or torch.utils.data.DataLoader): In the `influence` method, we either compute the influence score of training examples on examples in a test batch, or self influence scores for those training examples, depending on which mode is used. 
diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index 0a4e01fcf5..a6586acad0 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -252,9 +252,6 @@ def autodoc_process_docstring( lines[i] = re.sub( _replace_pattern(r"\bCallable\b"), "~typing.Callable", lines[i] ) - lines[i] = re.sub( - _replace_pattern(r"\bIterable\b"), "~typing.Iterable", lines[i] - ) lines[i] = re.sub( _replace_pattern(r"\bIterator\b"), "~typing.Iterator", lines[i] ) From 5e29522a32f02bc83a6f303ccbc6a67b66cd163f Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Mon, 15 Aug 2022 14:15:37 -0600 Subject: [PATCH 74/84] Remove unnecessary function * Adding `~typing.` doesn't break any formatting, so we don't need to check that its not inside brackets. --- sphinx/source/conf.py | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index a6586acad0..2de876eafa 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -213,22 +213,6 @@ # -- Docstring Improvements -------------------------------------------------- -def _replace_pattern(s: str) -> str: - """ - Wrap a string in regex code so that existing Sphinx formatting is not interfered - with. This function ensures that the string will not be replaced if it is inside - square brackets '[' & ']'. - - Args: - - s (str): A string to replace. - - Returns: - s (str): The input string wrapped in regex code. - """ - return r"(? 
None: @@ -248,13 +232,9 @@ def autodoc_process_docstring( continue # Ensure Any, Callable, & Iterator types are hyperlinked with intersphinx - lines[i] = re.sub(_replace_pattern(r"\bAny\b"), "~typing.Any", lines[i]) - lines[i] = re.sub( - _replace_pattern(r"\bCallable\b"), "~typing.Callable", lines[i] - ) - lines[i] = re.sub( - _replace_pattern(r"\bIterator\b"), "~typing.Iterator", lines[i] - ) + lines[i] = re.sub(r"\bAny\b", "~typing.Any", lines[i]) + lines[i] = re.sub(r"\bCallable\b", "~typing.Callable", lines[i]) + lines[i] = re.sub(r"\bIterator\b", "~typing.Iterator", lines[i]) def setup(app) -> None: From 82373fdff771b26d61e12f6225e10eacdef870aa Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Tue, 16 Aug 2022 09:45:36 -0600 Subject: [PATCH 75/84] Improve typing replacement string precision * generator -> Generator --- captum/attr/_core/feature_ablation.py | 2 +- sphinx/source/conf.py | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index 62c14b8e8e..dd1d71868c 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -418,7 +418,7 @@ def _ith_input_ablation_generator( This method returns a generator of ablation perturbations of the i-th input Returns: - ablation_iter (generator): yields each perturbation to be evaluated + ablation_iter (Generator): yields each perturbation to be evaluated as a tuple (inputs, additional_forward_args, targets, mask). """ extra_args = {} diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index 2de876eafa..993751bb91 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -213,6 +213,13 @@ # -- Docstring Improvements -------------------------------------------------- +# Regex code for typing replacements. +# The "(? 
None: @@ -231,10 +238,12 @@ def autodoc_process_docstring( if ":py:data:" in lines[i]: continue - # Ensure Any, Callable, & Iterator types are hyperlinked with intersphinx - lines[i] = re.sub(r"\bAny\b", "~typing.Any", lines[i]) - lines[i] = re.sub(r"\bCallable\b", "~typing.Callable", lines[i]) - lines[i] = re.sub(r"\bIterator\b", "~typing.Iterator", lines[i]) + # Ensure Any, Callable, & Iterator types are hyperlinked with intersphinx. + # The tilde '~' character hides the 'typing.' portion of the string. + lines[i] = re.sub(_rt[0] + r"Any" + _rt[1], "~typing.Any", lines[i]) + lines[i] = re.sub(_rt[0] + r"Callable" + _rt[1], "~typing.Callable", lines[i]) + lines[i] = re.sub(_rt[0] + r"Iterator" + _rt[1], "~typing.Iterator", lines[i]) + lines[i] = re.sub(_rt[0] + r"Iterable" + _rt[1], "~typing.Iterable", lines[i]) def setup(app) -> None: From 7a5f194ee127afd45d533449f6eb38e8bd335a8c Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 1 Sep 2022 15:39:36 -0600 Subject: [PATCH 76/84] Docstring type formatting changes --- captum/_utils/av.py | 8 ++++---- captum/attr/_core/deep_lift.py | 8 ++++---- captum/attr/_core/feature_ablation.py | 4 ++-- captum/attr/_core/feature_permutation.py | 2 +- captum/attr/_core/gradient_shap.py | 4 ++-- .../attr/_core/guided_backprop_deconvnet.py | 4 ++-- captum/attr/_core/guided_grad_cam.py | 4 ++-- captum/attr/_core/input_x_gradient.py | 2 +- captum/attr/_core/integrated_gradients.py | 4 ++-- captum/attr/_core/kernel_shap.py | 4 ++-- captum/attr/_core/layer/grad_cam.py | 4 ++-- captum/attr/_core/layer/internal_influence.py | 6 +++--- captum/attr/_core/layer/layer_activation.py | 2 +- captum/attr/_core/layer/layer_conductance.py | 6 +++--- captum/attr/_core/layer/layer_deep_lift.py | 8 ++++---- .../_core/layer/layer_feature_ablation.py | 6 +++--- .../attr/_core/layer/layer_gradient_shap.py | 8 ++++---- .../layer/layer_gradient_x_activation.py | 4 ++-- .../_core/layer/layer_integrated_gradients.py | 6 +++--- 
captum/attr/_core/layer/layer_lrp.py | 2 +- captum/attr/_core/lime.py | 6 +++--- captum/attr/_core/lrp.py | 2 +- .../attr/_core/neuron/neuron_conductance.py | 6 +++--- captum/attr/_core/neuron/neuron_deep_lift.py | 4 ++-- .../_core/neuron/neuron_feature_ablation.py | 4 ++-- captum/attr/_core/neuron/neuron_gradient.py | 2 +- .../attr/_core/neuron/neuron_gradient_shap.py | 4 ++-- .../neuron_guided_backprop_deconvnet.py | 4 ++-- .../neuron/neuron_integrated_gradients.py | 4 ++-- captum/attr/_core/occlusion.py | 4 ++-- captum/attr/_core/saliency.py | 2 +- captum/attr/_core/shapley_value.py | 8 ++++---- captum/attr/_utils/attribution.py | 8 ++++---- captum/attr/_utils/class_summarizer.py | 2 +- captum/attr/_utils/summarizer.py | 4 ++-- captum/attr/_utils/visualization.py | 8 ++++---- captum/concept/_core/cav.py | 6 +++--- captum/concept/_core/tcav.py | 20 +++++++++---------- captum/concept/_utils/common.py | 2 +- .../influence/_core/similarity_influence.py | 2 +- captum/influence/_core/tracincp.py | 8 ++++---- .../_core/tracincp_fast_rand_proj.py | 4 ++-- captum/insights/attr_vis/app.py | 4 ++-- captum/insights/attr_vis/features.py | 2 +- captum/metrics/_core/infidelity.py | 4 ++-- .../robust/_core/metrics/attack_comparator.py | 2 +- 46 files changed, 111 insertions(+), 111 deletions(-) diff --git a/captum/_utils/av.py b/captum/_utils/av.py index 5332345975..c8248e5ed2 100644 --- a/captum/_utils/av.py +++ b/captum/_utils/av.py @@ -211,7 +211,7 @@ def save( AV.generate_dataset_activations from batch index. It assumes identifier is same for all layers if a list of `layers` is provided. - layers (str or list of str): The layer(s) for which the activation vectors + layers (str or list[str]): The layer(s) for which the activation vectors are computed. act_tensors (tensor or list of tensor): A batch of activation vectors. This must match the dimension of `layers`. @@ -299,7 +299,7 @@ def _manage_loading_layers( for the `layer` are stored. 
model_id (str): The name/version of the model for which layer activations are being computed and stored. - layers (str or list of str): The layer(s) for which the activation vectors + layers (str or list[str]): The layer(s) for which the activation vectors are computed. load_from_disk (bool, optional): Whether or not to load from disk. Default: True @@ -359,7 +359,7 @@ def _compute_and_save_activations( define all of its layers as attributes of the model. model_id (str): The name/version of the model for which layer activations are being computed and stored. - layers (str or list of str): The layer(s) for which the activation vectors + layers (str or list[str]): The layer(s) for which the activation vectors are computed. inputs (tensor or tuple of tensors): Batch of examples for which influential instances are computed. They are passed to the @@ -435,7 +435,7 @@ def generate_dataset_activations( define all of its layers as attributes of the model. model_id (str): The name/version of the model for which layer activations are being computed and stored. - layers (str or list of str): The layer(s) for which the activation vectors + layers (str or list[str]): The layer(s) for which the activation vectors are computed. dataloader (torch.utils.data.DataLoader): DataLoader that yields Dataset for which influential instances are computed. They are passed to diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 7ac7b7a7a9..4f09ea35e9 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -194,7 +194,7 @@ def attribute( # type: ignore to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference samples that are compared with the inputs. 
In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -226,7 +226,7 @@ def attribute( # type: ignore use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -705,7 +705,7 @@ def attribute( # type: ignore to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, Callable): + baselines (tensor, tuple of tensors, or Callable): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -730,7 +730,7 @@ def attribute( # type: ignore It is recommended that the number of samples in the baselines' tensors is larger than one. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index dd1d71868c..f1b0e389fa 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -77,7 +77,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference value which replaces each feature when ablated. 
Baselines can be provided as: @@ -105,7 +105,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index a848262184..26b1015d40 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -120,7 +120,7 @@ def attribute( # type: ignore 0 corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 04d1599db5..7bfea31ff9 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -135,7 +135,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, Callable): + baselines (tensor, tuple of tensors, or Callable): Baselines define the starting point from which expectation is computed and can be provided as: @@ -171,7 +171,7 @@ def attribute( corresponds to the input with the same index in the inputs tuple. 
Default: 0.0 - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/guided_backprop_deconvnet.py b/captum/attr/_core/guided_backprop_deconvnet.py index 8c89846a1a..bced1927b9 100644 --- a/captum/attr/_core/guided_backprop_deconvnet.py +++ b/captum/attr/_core/guided_backprop_deconvnet.py @@ -148,7 +148,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -259,7 +259,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py index 6fef60bc0f..4e8cf92750 100644 --- a/captum/attr/_core/guided_grad_cam.py +++ b/captum/attr/_core/guided_grad_cam.py @@ -58,7 +58,7 @@ def __init__( layer (torch.nn.Module): Layer for which GradCAM attributions are computed. Currently, only layers with a single tensor output are supported. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. 
This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -88,7 +88,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/input_x_gradient.py b/captum/attr/_core/input_x_gradient.py index a69beccef5..8658b370f7 100644 --- a/captum/attr/_core/input_x_gradient.py +++ b/captum/attr/_core/input_x_gradient.py @@ -46,7 +46,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index d7f16edeb6..db327e2bf3 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -138,7 +138,7 @@ def attribute( # type: ignore that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. 
- baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalars or tensors, optional): Baselines define the starting point from which integral is computed and can be provided as: @@ -167,7 +167,7 @@ def attribute( # type: ignore use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/kernel_shap.py b/captum/attr/_core/kernel_shap.py index d7ddc6c5c3..183bcf84b3 100644 --- a/captum/attr/_core/kernel_shap.py +++ b/captum/attr/_core/kernel_shap.py @@ -94,7 +94,7 @@ def attribute( # type: ignore that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalars or tensors, optional): Baselines define the reference value which replaces each feature when the corresponding interpretable feature is set to 0. @@ -124,7 +124,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). 
diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py index cac8179a69..efb59606bb 100644 --- a/captum/attr/_core/layer/grad_cam.py +++ b/captum/attr/_core/layer/grad_cam.py @@ -65,7 +65,7 @@ def __init__( Output size of attribute matches this layer's output dimensions, except for dimension 2, which will be 1, since GradCAM sums over channels. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -94,7 +94,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index 7a1fc32241..dff61092e2 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -54,7 +54,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, @@ -86,7 +86,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define a starting point from which integral is computed and can be provided as: @@ -115,7 +115,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py index ea03ae25c7..b2762e3710 100644 --- a/captum/attr/_core/layer/layer_activation.py +++ b/captum/attr/_core/layer/layer_activation.py @@ -36,7 +36,7 @@ def __init__( this layer. If multiple layers are provided, attributions are returned as a list, each element corresponding to the activations of the corresponding layer. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index 6a09dd831a..b32c449baf 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -57,7 +57,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -128,7 +128,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define the starting point from which integral is computed and can be provided as: @@ -157,7 +157,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 6833e96252..60e2d58f44 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -153,7 +153,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -185,7 +185,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -491,7 +491,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, Callable): + baselines (tensor, tuple of tensors, or Callable): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -516,7 +516,7 @@ def attribute( It is recommended that the number of samples in the baselines' tensors is larger than one. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py index fd083a4876..f755b1b810 100644 --- a/captum/attr/_core/layer/layer_feature_ablation.py +++ b/captum/attr/_core/layer/layer_feature_ablation.py @@ -50,7 +50,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself @@ -83,7 +83,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - layer_baselines (scalar, tensor, tuple of scalars or tensors, optional): + layer_baselines (scalar, tensor, tuple of scalars or tensors, optional): Layer baselines define reference values which replace each layer input / output value when ablated. Layer baselines should be a single tensor with dimensions @@ -94,7 +94,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero as the baseline for each neuron. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 6df792584d..648e13dc7a 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -75,7 +75,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -155,7 +155,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, Callable): + baselines (tensor, tuple of tensors, or Callable): Baselines define the starting point from which expectation is computed and can be provided as: @@ -191,7 +191,7 @@ def attribute( corresponds to the input with the same index in the inputs tuple. Default: 0.0 - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -343,7 +343,7 @@ def __init__( the inputs or outputs of the layer, corresponding to attribution of each neuron in the input or output of this layer. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index 8f590956f3..b12f72bd66 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -41,7 +41,7 @@ def __init__( this layer. If multiple layers are provided, attributions are returned as a list, each element corresponding to the attributions of the corresponding layer. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -88,7 +88,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 795e73a06e..a31af1b383 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -73,7 +73,7 @@ def __init__( dependence, e.g. if you pass in l2 you cannot pass in l1 or l3. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, @@ -199,7 +199,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define the starting point from which integral is computed and can be provided as: @@ -227,7 +227,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index e2ae163f07..5b32872e4a 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -120,7 +120,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index 8391431712..dbb9b1daf6 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -274,7 +274,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. 
- target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). @@ -887,7 +887,7 @@ def attribute( # type: ignore that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference value which replaces each feature when the corresponding interpretable feature is set to 0. @@ -917,7 +917,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index e0c528d542..c960977b7f 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -108,7 +108,7 @@ def attribute( to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index 05f790c776..8e61bcd685 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -62,7 +62,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -143,7 +143,7 @@ def attribute( the gradient of output with respect to the intermedite neuron, which cannot be computed for aggregations of multiple intemediate neurons. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define the starting point from which integral is computed and can be provided as: @@ -172,7 +172,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 0d1137e86c..8c1b6e10c1 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -133,7 +133,7 @@ def attribute( or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -378,7 +378,7 @@ def attribute( or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (tensor, tuple of tensors, Callable): + baselines (tensor, tuple of tensors, or Callable): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index b5bdadff6b..95fabe0750 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -44,7 +44,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, @@ -109,7 +109,7 @@ def attribute( or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 2b332596c9..541045a6f1 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -44,7 +44,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index 0408750375..bf3089cee3 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -66,7 +66,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the neurons in this layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, @@ -147,7 +147,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (tensor, tuple of tensors, Callable): + baselines (tensor, tuple of tensors, or Callable): Baselines define the starting point from which expectation is computed and can be provided as: diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 060109256f..10d2d1ffc1 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -48,7 +48,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -217,7 +217,7 @@ def __init__( in the attribute method. Currently, only layers with a single tensor output are supported. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. 
If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index cce7ee153d..25bf1c9d30 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -44,7 +44,7 @@ def __init__( Currently, it is assumed that the inputs or the outputs of the layer, depending on which one is used for attribution, can only be a single tensor. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model. This allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -125,7 +125,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define the starting point from which integral is computed. Baselines can be provided as: diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index 12bae0c6f9..9a2a20d0cd 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -100,7 +100,7 @@ def attribute( # type: ignore If None is provided, a stride of 1 is used for each dimension of each input tensor. Default: None - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference value which replaces each feature when occluded. Baselines can be provided as: @@ -128,7 +128,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. 
Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py index 95feff7478..3f9e9c23c0 100644 --- a/captum/attr/_core/saliency.py +++ b/captum/attr/_core/saliency.py @@ -52,7 +52,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index a0d980a54b..1df073f1a4 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -106,7 +106,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalars or tensors, optional): Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: @@ -135,7 +135,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, @@ -559,7 +559,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: @@ -588,7 +588,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index b7f634d725..4e2b2e45ee 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -203,7 +203,7 @@ def compute_convergence_delta( is the end point of attributions' approximation. It is assumed that both `start_point` and `end_point` have the same shape and dimensionality. - target (int, tuple, tensor or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -339,7 +339,7 @@ def __init__( function. layer (torch.nn.Module): Layer for which output attributions are computed. Output size of attribute matches that of layer output. 
- device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model, which allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -372,7 +372,7 @@ def __init__( function. layer (torch.nn.Module): Layer for which output attributions are computed. Output size of attribute matches that of layer output. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model, which allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, @@ -442,7 +442,7 @@ def __init__( function. layer (torch.nn.Module): Layer for which output attributions are computed. Output size of attribute matches that of layer output. - device_ids (list of int): Device ID list, necessary only if forward_func + device_ids (list[int]): Device ID list, necessary only if forward_func applies a DataParallel model, which allows reconstruction of intermediate outputs from batched results across devices. If forward_func is given as the DataParallel model itself, diff --git a/captum/attr/_utils/class_summarizer.py b/captum/attr/_utils/class_summarizer.py index 9740674136..b8b0921c8d 100644 --- a/captum/attr/_utils/class_summarizer.py +++ b/captum/attr/_utils/class_summarizer.py @@ -40,7 +40,7 @@ def update( # type: ignore The input tensor to be summarised. The first dimension of this input must be associated to the batch size of the inputs. - labels (int, tuple, tensor or list, optional): + labels (int, tuple, tensor, or list, optional): The associated labels for `x`. If Any, we assume `labels` represents the label for all inputs in `x`. 
diff --git a/captum/attr/_utils/summarizer.py b/captum/attr/_utils/summarizer.py index f82ed6357a..2011f3b4d4 100644 --- a/captum/attr/_utils/summarizer.py +++ b/captum/attr/_utils/summarizer.py @@ -173,10 +173,10 @@ class SummarizerSingleTensor: def __init__(self, stats: List[Stat], summary_stats_indices: List[int]) -> None: r""" Args: - stats (list of Stat): A list of all the Stat objects that + stats (list[Stat]): A list of all the Stat objects that need to be updated. This must be in the appropriate order for updates (see `_reorder_stats`) - summary_stats (list of int): A list of indicies, referencing `stats`, + summary_stats (list[int]): A list of indices, referencing `stats`, which are the stats you want to show in the .summary property. This does not require any specific order. """ diff --git a/captum/attr/_utils/visualization.py b/captum/attr/_utils/visualization.py index 0f50b0cff8..c4da31b6d6 100644 --- a/captum/attr/_utils/visualization.py +++ b/captum/attr/_utils/visualization.py @@ -359,13 +359,13 @@ def visualize_image_attr_multiple( with values in range 0-1 or 0-255. This is a necessary argument for any visualization method which utilizes the original image. - methods (list of str): List of strings of length k, defining method + methods (list[str]): List of strings of length k, defining method for each visualization. Each method must be a valid string argument for method to visualize_image_attr. - signs (list of str): List of strings of length k, defining signs for + signs (list[str]): List of strings of length k, defining signs for each visualization. Each sign must be a valid string argument for sign to visualize_image_attr. - titles (list of str, optional): List of strings of length k, providing + titles (list[str], optional): List of strings of length k, providing a title string for each plot. If None is provided, no titles are added to subplots. Default: None @@ -505,7 +505,7 @@ def visualize_timeseries_attr( values. 
Default: `absolute_value` - channel_labels (list of str, optional): List of labels + channel_labels (list[str], optional): List of labels corresponding to each channel in data. Default: None channels_last (bool, optional): If True, data is expected to have diff --git a/captum/concept/_core/cav.py b/captum/concept/_core/cav.py index 9ded9c4032..6aedb24fff 100644 --- a/captum/concept/_core/cav.py +++ b/captum/concept/_core/cav.py @@ -30,7 +30,7 @@ def __init__( and loads them from the disk (storage). Args: - concepts (list of Concept): a List of Concept objects. Only their + concepts (list[Concept]): a List of Concept objects. Only their names will be saved and loaded. layer (str): The layer where concept activation vectors are computed using a predefined classifier. @@ -65,7 +65,7 @@ def assemble_save_path( layer name. model_id (str): A unique model identifier associated with input `layer` and `concepts` - concepts (list of Concept): A list of concepts that are concatenated + concepts (list[Concept]): A list of concepts that are concatenated together and used as a concept key using their ids. These concept ids are retrieved from TCAV s`Concept` objects. layer (str): The name of the layer for which the activations are @@ -146,7 +146,7 @@ def load(cavs_path: str, model_id: str, concepts: List[Concept], layer: str): model_id (str): A unique model identifier associated with the CAVs. There exist a folder named `model_id` under `cavs_path` path. The CAVs are loaded from this folder. - concepts (list of Concept): A List of concepts for which + concepts (list[Concept]): A List of concepts for which we would like to load the cavs. layer (str): The layer name. Ex.: "inception4c". In case of nested layers we use dots to specify the depth / hierarchy. 
diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index 8f27f5f3f2..63ea0ee832 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -40,10 +40,10 @@ def __init__(self, datasets: List[AV.AVDataset], labels: List[int]) -> None: Args: - datasets (list of Dataset): The k-th element of datasets is a Dataset + datasets (list[Dataset]): The k-th element of datasets is a Dataset representing activation vectors associated with the k-th concept - labels (list of int): The k-th element of labels is the integer label + labels (list[int]): The k-th element of labels is the integer label associated with the k-th concept """ assert len(datasets) == len( @@ -120,11 +120,11 @@ def train_cav( model_id (str): A unique identifier for the PyTorch model for which we would like to load the layer activations and train a model in order to compute CAVs. - concepts (list of Concept): A list of Concept objects that are used + concepts (list[Concept]): A list of Concept objects that are used to train a classifier and learn decision boundaries between those concepts for each layer defined in the `layers` argument. - layers (str, list of str): A list of layer names or a single layer + layers (str or list[str]): A list of layer names or a single layer name that is used to compute the activations of all concept examples per concept and train a classifier using those activations. @@ -258,7 +258,7 @@ def __init__( model (Module): An instance of pytorch model that is used to compute layer activations and attributions. - layers (str, list of str): A list of layer name(s) that are + layers (str or list[str]): A list of layer name(s) that are used for computing concept activations (cavs) and layer attributions. model_id (str, optional): A unique identifier for the PyTorch `model` @@ -347,7 +347,7 @@ def generate_activation(self, layers: Union[str, List], concept: Concept) -> Non the list of layer(s) `layers`. 
Args: - layers (str, list of str): A list of layer names or a layer name + layers (str or list[str]): A list of layer names or a layer name that is used to compute layer activations for the specific `concept`. concept (Concept): A single Concept object that provides access @@ -384,7 +384,7 @@ def generate_activations(self, concept_layers: Dict[Concept, List[str]]) -> None `concept_layers` dictionary. Args: - concept_layers (dict[Concept, list of str]): Dictionay that maps + concept_layers (dict[Concept, list[str]]): Dictionary that maps Concept objects to a list of layer names to generate the activations. Ex.: concept_layers = {"striped": ['inception4c', 'inception4d']} @@ -409,11 +409,11 @@ def load_cavs( Args: - concepts (list of Concept): A list of Concept objects for which we want + concepts (list[Concept]): A list of Concept objects for which we want to load the CAV. Returns: - layers (list of layer): A list of layers for which some CAVs still need + layers (list[layer]): A list of layers for which some CAVs still need to be computed. concept_layers (dict[concept, layer]): A dictionay of concept-layers mapping for which we need to perform CAV computation through @@ -590,7 +590,7 @@ def interpret( provided, the examples must be aligned appropriately. experimental_sets (list of list of Concept): A list of list of Concept instances. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, tensor, or list, optional): Output indices for which attributions are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/concept/_utils/common.py b/captum/concept/_utils/common.py index dece946601..6161736509 100644 --- a/captum/concept/_utils/common.py +++ b/captum/concept/_utils/common.py @@ -11,7 +11,7 @@ def concepts_to_str(concepts: List[Concept]) -> str: Example output: "striped-random_0-random_1" Args: - concepts (list of Concept): a List of concept names to be + concepts (list[Concept]): a List of concept names to be concatenated and used as a concepts key. These concept names are respective to the Concept objects used for the classifier train. diff --git a/captum/influence/_core/similarity_influence.py b/captum/influence/_core/similarity_influence.py index d4e131f5e5..9e69e43af5 100644 --- a/captum/influence/_core/similarity_influence.py +++ b/captum/influence/_core/similarity_influence.py @@ -82,7 +82,7 @@ def __init__( Args: module (torch.nn.Module): An instance of pytorch model. This model should define all of its layers as attributes of the model. - layers (str or list of str): The fully qualified layer(s) for which the + layers (str or list[str]): The fully qualified layer(s) for which the activation vectors are computed. influence_src_dataset (torch.utils.data.Dataset): PyTorch Dataset that is used to create a PyTorch Dataloader to iterate over the dataset and diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index dec735cb90..638d48817a 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -132,7 +132,7 @@ def __init__( `model` accepts `L-1` arguments, and the last element of `batch` is the label. In other words, `model(*batch[:-1])` gives the output of `model`, and `batch[-1]` are the labels for the batch. 
- checkpoints (str, list of str, or Iterator): Either the directory of the + checkpoints (str, list[str], or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -141,7 +141,7 @@ def __init__( learning rate if it is saved. By default uses a utility to load a model saved as a state dict. Default: _load_flexible_state_dict - layers (list of str or None, optional): A list of layer names for which + layers (list[str] or None, optional): A list of layer names for which gradients should be computed. If `layers` is None, gradients will be computed for all layers. Otherwise, they will only be computed for the layers specified in `layers`. @@ -532,7 +532,7 @@ def __init__( `model` accepts `L-1` arguments, and the last element of `batch` is the label. In other words, `model(*batch[:-1])` gives the output of `model`, and `batch[-1]` are the labels for the batch. - checkpoints (str, list of str, or Iterator): Either the directory of the + checkpoints (str, list[str], or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -541,7 +541,7 @@ def __init__( learning rate if it is saved. By default uses a utility to load a model saved as a state dict. Default: _load_flexible_state_dict - layers (list of str or None, optional): A list of layer names for which + layers (list[str] or None, optional): A list of layer names for which gradients should be computed. If `layers` is None, gradients will be computed for all layers. Otherwise, they will only be computed for the layers specified in `layers`. 
diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index e8569908ef..5c9d85828a 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -115,7 +115,7 @@ def __init__( `model` accepts `L-1` arguments, and the last element of `batch` is the label. In other words, `model(*batch[:-1])` gives the output of `model`, and `batch[-1]` are the labels for the batch. - checkpoints (str, list of str, or Iterator): Either the directory of the + checkpoints (str, list[str], or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. @@ -842,7 +842,7 @@ def __init__( `model` accepts `L-1` arguments, and the last element of `batch` is the label. In other words, `model(*batch[:-1])` gives the output of `model`, and `batch[-1]` are the labels for the batch. - checkpoints (str, list of str, or Iterator): Either the directory of the + checkpoints (str, list[str], or Iterator): Either the directory of the path to store and retrieve model checkpoints, a list of filepaths with checkpoints from which to load, or an iterator which returns objects from which to load checkpoints. diff --git a/captum/insights/attr_vis/app.py b/captum/insights/attr_vis/app.py index 83b59105e4..54ca7f3ac3 100644 --- a/captum/insights/attr_vis/app.py +++ b/captum/insights/attr_vis/app.py @@ -151,9 +151,9 @@ def __init__( models (torch.nn.Module): One or more PyTorch modules (models) for attribution visualization. - classes (list of str): List of strings corresponding to the names of + classes (list[str]): List of strings corresponding to the names of classes for classification. 
- features (list of BaseFeature): List of BaseFeatures, which correspond + features (list[BaseFeature]): List of BaseFeatures, which correspond to input arguments to the model. Each feature object defines relevant transformations for converting to model input, constructing baselines, and visualizing. The length of the diff --git a/captum/insights/attr_vis/features.py b/captum/insights/attr_vis/features.py index bd95bc47e9..9a048e57a6 100644 --- a/captum/insights/attr_vis/features.py +++ b/captum/insights/attr_vis/features.py @@ -239,7 +239,7 @@ def __init__(self, name: str, categories: List[str]) -> None: Args: name (str): The label of the specific feature. For example, an ImageFeature's name can be "Photo". - categories (list of str): Category labels for the general feature. The + categories (list[str]): Category labels for the general feature. The order and size should match the second dimension of the ``data`` tensor parameter in ``visualize``. """ diff --git a/captum/metrics/_core/infidelity.py b/captum/metrics/_core/infidelity.py index e8e1599f1e..50142e08fd 100644 --- a/captum/metrics/_core/infidelity.py +++ b/captum/metrics/_core/infidelity.py @@ -221,7 +221,7 @@ def infidelity( multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalars or tensors, optional): + baselines (scalar, tensor, tuple of scalar, or tensors, optional): Baselines define reference values which sometimes represent ablated values and are used to compare with the actual inputs to compute importance scores in attribution algorithms. They can be represented @@ -305,7 +305,7 @@ def infidelity( being passed to `perturb_func` as an input argument. Default: None - target (int, tuple, tensor or list, optional): Indices for selecting + target (int, tuple, tensor, or list, optional): Indices for selecting predictions from output(for classification cases, this is usually the target class). 
If the network returns a scalar value per example, no target diff --git a/captum/robust/_core/metrics/attack_comparator.py b/captum/robust/_core/metrics/attack_comparator.py index 030be219a5..7964711883 100644 --- a/captum/robust/_core/metrics/attack_comparator.py +++ b/captum/robust/_core/metrics/attack_comparator.py @@ -140,7 +140,7 @@ def add_attack( arguments. Default: ``None`` - additional_attack_arg_names (list of str, optional): Any additional + additional_attack_arg_names (list[str], optional): Any additional arguments for the attack which are specific to the particular input example or batch. An example of this is target, which is necessary for some attacks such as FGSM or PGD. These arguments are included From f15a8081c893a8e05ef00f00bb55008e2ede2433 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 1 Sep 2022 15:51:27 -0600 Subject: [PATCH 77/84] tensor & tensors -> Tensor --- captum/_utils/av.py | 2 +- captum/_utils/gradient.py | 4 +-- captum/_utils/models/linear_model/model.py | 6 ++--- captum/attr/_core/deep_lift.py | 12 ++++----- captum/attr/_core/feature_ablation.py | 8 +++--- captum/attr/_core/feature_permutation.py | 6 ++--- captum/attr/_core/gradient_shap.py | 6 ++--- .../attr/_core/guided_backprop_deconvnet.py | 8 +++--- captum/attr/_core/guided_grad_cam.py | 4 +-- captum/attr/_core/input_x_gradient.py | 4 +-- captum/attr/_core/integrated_gradients.py | 6 ++--- captum/attr/_core/kernel_shap.py | 8 +++--- captum/attr/_core/layer/grad_cam.py | 4 +-- captum/attr/_core/layer/internal_influence.py | 6 ++--- captum/attr/_core/layer/layer_activation.py | 2 +- captum/attr/_core/layer/layer_conductance.py | 6 ++--- captum/attr/_core/layer/layer_deep_lift.py | 12 ++++----- .../_core/layer/layer_feature_ablation.py | 8 +++--- .../attr/_core/layer/layer_gradient_shap.py | 6 ++--- .../layer/layer_gradient_x_activation.py | 4 +-- .../_core/layer/layer_integrated_gradients.py | 6 ++--- captum/attr/_core/layer/layer_lrp.py | 4 +-- captum/attr/_core/lime.py 
| 12 ++++----- captum/attr/_core/lrp.py | 8 +++--- .../attr/_core/neuron/neuron_conductance.py | 6 ++--- captum/attr/_core/neuron/neuron_deep_lift.py | 8 +++--- .../_core/neuron/neuron_feature_ablation.py | 6 ++--- captum/attr/_core/neuron/neuron_gradient.py | 2 +- .../attr/_core/neuron/neuron_gradient_shap.py | 4 +-- .../neuron_guided_backprop_deconvnet.py | 4 +-- .../neuron/neuron_integrated_gradients.py | 4 +-- captum/attr/_core/noise_tunnel.py | 2 +- captum/attr/_core/occlusion.py | 6 ++--- captum/attr/_core/saliency.py | 4 +-- captum/attr/_core/shapley_value.py | 16 ++++++------ captum/attr/_models/base.py | 2 +- captum/attr/_utils/attribution.py | 12 ++++----- captum/attr/_utils/class_summarizer.py | 4 +-- captum/attr/_utils/summarizer.py | 2 +- captum/concept/_core/concept.py | 2 +- captum/concept/_core/tcav.py | 8 +++--- captum/concept/_utils/classifier.py | 4 +-- .../influence/_core/similarity_influence.py | 2 +- captum/influence/_core/tracincp.py | 20 +++++++------- .../_core/tracincp_fast_rand_proj.py | 26 +++++++++---------- captum/influence/_utils/common.py | 8 +++--- captum/influence/_utils/nearest_neighbors.py | 8 +++--- captum/insights/attr_vis/app.py | 4 +-- captum/metrics/_core/infidelity.py | 10 +++---- captum/metrics/_core/sensitivity.py | 8 +++--- captum/metrics/_utils/batching.py | 2 +- captum/robust/_core/fgsm.py | 2 +- captum/robust/_core/perturbation.py | 2 +- captum/robust/_core/pgd.py | 2 +- 54 files changed, 171 insertions(+), 171 deletions(-) diff --git a/captum/_utils/av.py b/captum/_utils/av.py index c8248e5ed2..1b749162f8 100644 --- a/captum/_utils/av.py +++ b/captum/_utils/av.py @@ -361,7 +361,7 @@ def _compute_and_save_activations( are being computed and stored. layers (str or list[str]): The layer(s) for which the activation vectors are computed. - inputs (tensor or tuple of tensors): Batch of examples for + inputs (Tensor or tuple of Tensor): Batch of examples for which influential instances are computed. 
They are passed to the input `model`. The first dimension in `inputs` tensor or tuple of tensors corresponds to the batch size. diff --git a/captum/_utils/gradient.py b/captum/_utils/gradient.py index 2b754f05e8..5b853cd435 100644 --- a/captum/_utils/gradient.py +++ b/captum/_utils/gradient.py @@ -730,7 +730,7 @@ def _compute_jacobian_wrt_params( but must behave as a library loss function would if `reduction='none'`. Returns: - grads (tuple of tensor): Returns the Jacobian for the minibatch as a + grads (tuple of Tensor): Returns the Jacobian for the minibatch as a tuple of gradients corresponding to the tuple of trainable parameters returned by `model.parameters()`. Each object grads[i] references to the gradients for the parameters in the i-th trainable layer of the model. @@ -804,7 +804,7 @@ def _compute_jacobian_wrt_params_with_sample_wise_trick( Defaults to 'sum'. Returns: - grads (tuple of tensor): Returns the Jacobian for the minibatch as a + grads (tuple of Tensor): Returns the Jacobian for the minibatch as a tuple of gradients corresponding to the tuple of trainable parameters returned by `model.parameters()`. Each object grads[i] references to the gradients for the parameters in the i-th trainable layer of the model. diff --git a/captum/_utils/models/linear_model/model.py b/captum/_utils/models/linear_model/model.py index 6b8623a560..24302d540c 100644 --- a/captum/_utils/models/linear_model/model.py +++ b/captum/_utils/models/linear_model/model.py @@ -65,14 +65,14 @@ def _construct_model_params( normalization parameters used. bias (bool): Whether to add a bias term. Not needed if normalized input. - weight_values (tensor, optional): + weight_values (Tensor, optional): The values to initialize the linear model with. This must be a 1D or 2D tensor, and of the form `(num_outputs, num_features)` or `(num_features,)`. Additionally, if this is provided you need not to provide `in_features` or `out_features`. 
- bias_value (tensor, optional): + bias_value (Tensor, optional): The bias value to initialize the model with. - classes (tensor, optional): + classes (Tensor, optional): The list of prediction classes supported by the model in case it performs classificaton. In case of regression it is set to None. Default: None diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 4f09ea35e9..425aa2a2ee 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -185,7 +185,7 @@ def attribute( # type: ignore r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -194,7 +194,7 @@ def attribute( # type: ignore to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -226,7 +226,7 @@ def attribute( # type: ignore use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -696,7 +696,7 @@ def attribute( # type: ignore r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. 
If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -705,7 +705,7 @@ def attribute( # type: ignore to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, or Callable): + baselines (Tensor, tuple of Tensor, or Callable): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -730,7 +730,7 @@ def attribute( # type: ignore It is recommended that the number of samples in the baselines' tensors is larger than one. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index f1b0e389fa..862ce085c4 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -68,7 +68,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which ablation + inputs (Tensor or tuple of Tensor): Input for which ablation attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -77,7 +77,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference value which replaces each feature when ablated. 
Baselines can be provided as: @@ -105,7 +105,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -145,7 +145,7 @@ def attribute( Note that attributions are not computed with respect to these arguments. Default: None - feature_mask (tensor or tuple of tensors, optional): + feature_mask (Tensor or tuple of Tensor, optional): feature_mask defines a mask for the input, grouping features which should be ablated together. feature_mask should contain the same number of tensors as inputs. diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index 26b1015d40..87b270799b 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -110,7 +110,7 @@ def attribute( # type: ignore Args: - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which permutation attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If @@ -120,7 +120,7 @@ def attribute( # type: ignore 0 corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -160,7 +160,7 @@ def attribute( # type: ignore Note that attributions are not computed with respect to these arguments. 
Default: None - feature_mask (tensor or tuple of tensors, optional): + feature_mask (Tensor or tuple of Tensor, optional): feature_mask defines a mask for the input, grouping features which should be ablated together. feature_mask should contain the same number of tensors as inputs. diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 7bfea31ff9..4db185eeda 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -127,7 +127,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which SHAP attribution + inputs (Tensor or tuple of Tensor): Input for which SHAP attribution values are computed. If `forward_func` takes a single tensor as input, a single input tensor should be provided. If `forward_func` takes multiple tensors as input, a tuple @@ -135,7 +135,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, or Callable): + baselines (Tensor, tuple of Tensor, or Callable): Baselines define the starting point from which expectation is computed and can be provided as: @@ -171,7 +171,7 @@ def attribute( corresponds to the input with the same index in the inputs tuple. Default: 0.0 - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/guided_backprop_deconvnet.py b/captum/attr/_core/guided_backprop_deconvnet.py index bced1927b9..21dc3154a2 100644 --- a/captum/attr/_core/guided_backprop_deconvnet.py +++ b/captum/attr/_core/guided_backprop_deconvnet.py @@ -139,7 +139,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -148,7 +148,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -250,7 +250,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -259,7 +259,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py index 4e8cf92750..01e53e9a1f 100644 --- a/captum/attr/_core/guided_grad_cam.py +++ b/captum/attr/_core/guided_grad_cam.py @@ -80,7 +80,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which attributions + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -88,7 +88,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/input_x_gradient.py b/captum/attr/_core/input_x_gradient.py index 8658b370f7..a2a9978032 100644 --- a/captum/attr/_core/input_x_gradient.py +++ b/captum/attr/_core/input_x_gradient.py @@ -37,7 +37,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -46,7 +46,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. 
- target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index db327e2bf3..7b16b58466 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -130,7 +130,7 @@ def attribute( # type: ignore Args: - inputs (tensor or tuple of tensors): Input for which integrated + inputs (Tensor or tuple of Tensor): Input for which integrated gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -138,7 +138,7 @@ def attribute( # type: ignore that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define the starting point from which integral is computed and can be provided as: @@ -167,7 +167,7 @@ def attribute( # type: ignore use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/kernel_shap.py b/captum/attr/_core/kernel_shap.py index 183bcf84b3..bf95182747 100644 --- a/captum/attr/_core/kernel_shap.py +++ b/captum/attr/_core/kernel_shap.py @@ -86,7 +86,7 @@ def attribute( # type: ignore Args: - inputs (tensor or tuple of tensors): Input for which KernelShap + inputs (Tensor or tuple of Tensor): Input for which KernelShap is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -94,7 +94,7 @@ def attribute( # type: ignore that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define the reference value which replaces each feature when the corresponding interpretable feature is set to 0. @@ -124,7 +124,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). @@ -167,7 +167,7 @@ def attribute( # type: ignore Note that attributions are not computed with respect to these arguments. Default: None - feature_mask (tensor or tuple of tensors, optional): + feature_mask (Tensor or tuple of Tensor, optional): feature_mask defines a mask for the input, grouping features which correspond to the same interpretable feature. 
feature_mask diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py index efb59606bb..f848ff9b28 100644 --- a/captum/attr/_core/layer/grad_cam.py +++ b/captum/attr/_core/layer/grad_cam.py @@ -86,7 +86,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which attributions + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -94,7 +94,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index dff61092e2..bc40893fd0 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -78,7 +78,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which internal + inputs (Tensor or tuple of Tensor): Input for which internal influence is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -86,7 +86,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. 
- baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define a starting point from which integral is computed and can be provided as: @@ -115,7 +115,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py index b2762e3710..3c5e63d70b 100644 --- a/captum/attr/_core/layer/layer_activation.py +++ b/captum/attr/_core/layer/layer_activation.py @@ -54,7 +54,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (Tensor or tuple of Tensor): Input for which layer activation is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index b32c449baf..f587690572 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -120,7 +120,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (Tensor or tuple of Tensor): Input for which layer conductance is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -128,7 +128,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. 
- baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define the starting point from which integral is computed and can be provided as: @@ -157,7 +157,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index 60e2d58f44..aa31d51b38 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -144,7 +144,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (Tensor or tuple of Tensor): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, @@ -153,7 +153,7 @@ def attribute( corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -185,7 +185,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, @@ -482,7 +482,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (Tensor or tuple of Tensor): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -491,7 +491,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, or Callable): + baselines (Tensor, tuple of Tensor, or Callable): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -516,7 +516,7 @@ def attribute( It is recommended that the number of samples in the baselines' tensors is larger than one. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py index f755b1b810..80e22d154f 100644 --- a/captum/attr/_core/layer/layer_feature_ablation.py +++ b/captum/attr/_core/layer/layer_feature_ablation.py @@ -75,7 +75,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (Tensor or tuple of Tensor): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. 
If forward_func takes multiple tensors as input, a tuple @@ -83,7 +83,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - layer_baselines (scalar, tensor, tuple of scalar, or tensors, optional): + layer_baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Layer baselines define reference values which replace each layer input / output value when ablated. Layer baselines should be a single tensor with dimensions @@ -94,7 +94,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero as the baseline for each neuron. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -131,7 +131,7 @@ def attribute( Note that attributions are not computed with respect to these arguments. Default: None - layer_mask (tensor or tuple of tensors, optional): + layer_mask (Tensor or tuple of Tensor, optional): layer_mask defines a mask for the layer, grouping elements of the layer input / output which should be ablated together. diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 648e13dc7a..4f0cdbe63e 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -146,7 +146,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input which are used to compute + inputs (Tensor or tuple of Tensor): Input which are used to compute SHAP attribution values for a given `layer`. If `forward_func` takes a single tensor as input, a single input tensor should be provided. 
@@ -155,7 +155,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (tensor, tuple of tensors, or Callable): + baselines (Tensor, tuple of Tensor, or Callable): Baselines define the starting point from which expectation is computed and can be provided as: @@ -191,7 +191,7 @@ def attribute( corresponds to the input with the same index in the inputs tuple. Default: 0.0 - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index b12f72bd66..5de6e6a78b 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -80,7 +80,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which attributions + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -88,7 +88,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index a31af1b383..9c259efd2d 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -191,7 +191,7 @@ def attribute( Args: - inputs (tensor or tuple of tensors): Input for which layer integrated + inputs (Tensor or tuple of Tensor): Input for which layer integrated gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -199,7 +199,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define the starting point from which integral is computed and can be provided as: @@ -227,7 +227,7 @@ def attribute( use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index 5b32872e4a..475a534025 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -111,7 +111,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which relevance is + inputs (Tensor or tuple of Tensor): Input for which relevance is propagated. If forward_func takes a single tensor as input, a single input tensor should be provided. 
@@ -120,7 +120,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index dbb9b1daf6..87cf5fee8f 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -266,7 +266,7 @@ def attribute( Args: - inputs (tensor or tuple of tensors): Input for which LIME + inputs (Tensor or tuple of Tensor): Input for which LIME is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -274,7 +274,7 @@ def attribute( that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). @@ -879,7 +879,7 @@ def attribute( # type: ignore Args: - inputs (tensor or tuple of tensors): Input for which LIME + inputs (Tensor or tuple of Tensor): Input for which LIME is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -887,7 +887,7 @@ def attribute( # type: ignore that for all given input tensors, dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. 
- baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference value which replaces each feature when the corresponding interpretable feature is set to 0. @@ -917,7 +917,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which surrogate model is trained (for classification cases, this is usually the target class). @@ -960,7 +960,7 @@ def attribute( # type: ignore Note that attributions are not computed with respect to these arguments. Default: None - feature_mask (tensor or tuple of tensors, optional): + feature_mask (Tensor or tuple of Tensor, optional): feature_mask defines a mask for the input, grouping features which correspond to the same interpretable feature. feature_mask diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index c960977b7f..5e32225382 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -99,7 +99,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which relevance is + inputs (Tensor or tuple of Tensor): Input for which relevance is propagated. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -108,7 +108,7 @@ def attribute( to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). 
If the network returns a scalar value per example, @@ -246,7 +246,7 @@ def compute_convergence_delta( Args: - attributions (tensor or tuple of tensors): Attribution scores that + attributions (Tensor or tuple of Tensor): Attribution scores that are precomputed by an attribution algorithm. Attributions can be provided in form of a single tensor or a tuple of those. It is assumed that attribution @@ -254,7 +254,7 @@ def compute_convergence_delta( examples, and if multiple input tensors are provided, the examples must be aligned appropriately. - output (tensor): The output value with respect to which + output (Tensor): The output value with respect to which the attribution values are computed. This value corresponds to the target score of a classification model. The given tensor should only have a single element. diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index 8e61bcd685..135b44453f 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -103,7 +103,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which neuron + inputs (Tensor or tuple of Tensor): Input for which neuron conductance is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -143,7 +143,7 @@ def attribute( the gradient of output with respect to the intermedite neuron, which cannot be computed for aggregations of multiple intemediate neurons. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define the starting point from which integral is computed and can be provided as: @@ -172,7 +172,7 @@ def attribute( use zero scalar corresponding to each input tensor. 
Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 8c1b6e10c1..97a0dc72db 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -90,7 +90,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (Tensor or tuple of Tensor): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, @@ -133,7 +133,7 @@ def attribute( or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference samples that are compared with the inputs. In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and @@ -335,7 +335,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which layer + inputs (Tensor or tuple of Tensor): Input for which layer attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, @@ -378,7 +378,7 @@ def attribute( or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (tensor, tuple of tensors, or Callable): + baselines (Tensor, tuple of Tensor, or Callable): Baselines define reference samples that are compared with the inputs. 
In order to assign attribution scores DeepLift computes the differences between the inputs/outputs and diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index 95fabe0750..3b996ffb6f 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -67,7 +67,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which neuron + inputs (Tensor or tuple of Tensor): Input for which neuron attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -109,7 +109,7 @@ def attribute( or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: @@ -149,7 +149,7 @@ def attribute( Note that attributions are not computed with respect to these arguments. Default: None - feature_mask (tensor or tuple of tensors, optional): + feature_mask (Tensor or tuple of Tensor, optional): feature_mask defines a mask for the input, grouping features which should be ablated together. feature_mask should contain the same number of tensors as inputs. diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 541045a6f1..76b2c30431 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -64,7 +64,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which neuron + inputs (Tensor or tuple of Tensor): Input for which neuron gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. 
If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index bf3089cee3..a142523784 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -106,7 +106,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which SHAP attribution + inputs (Tensor or tuple of Tensor): Input for which SHAP attribution values are computed. If `forward_func` takes a single tensor as input, a single input tensor should be provided. If `forward_func` takes multiple tensors as input, a tuple @@ -147,7 +147,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (tensor, tuple of tensors, or Callable): + baselines (Tensor, tuple of Tensor, or Callable): Baselines define the starting point from which expectation is computed and can be provided as: diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 10d2d1ffc1..7b0ea844af 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -69,7 +69,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -238,7 +238,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. 
If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index 25bf1c9d30..ebf80a7241 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -84,7 +84,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which neuron integrated + inputs (Tensor or tuple of Tensor): Input for which neuron integrated gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -125,7 +125,7 @@ def attribute( this function returns either a tensor with one element or a 1D tensor with length equal to batch_size (one scalar per input example) - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define the starting point from which integral is computed. Baselines can be provided as: diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 17422d31b6..5fbb01378a 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -95,7 +95,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which integrated + inputs (Tensor or tuple of Tensor): Input for which integrated gradients are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. 
If forward_func takes multiple tensors as input, a tuple diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index 9a2a20d0cd..b7e03f5cb4 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -62,7 +62,7 @@ def attribute( # type: ignore r""" Args: - inputs (tensor or tuple of tensors): Input for which occlusion + inputs (Tensor or tuple of Tensor): Input for which occlusion attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -100,7 +100,7 @@ def attribute( # type: ignore If None is provided, a stride of 1 is used for each dimension of each input tensor. Default: None - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference value which replaces each feature when occluded. Baselines can be provided as: @@ -128,7 +128,7 @@ def attribute( # type: ignore In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py index 3f9e9c23c0..91a65e2733 100644 --- a/captum/attr/_core/saliency.py +++ b/captum/attr/_core/saliency.py @@ -43,7 +43,7 @@ def attribute( r""" Args: - inputs (tensor or tuple of tensors): Input for which saliency + inputs (Tensor or tuple of Tensor): Input for which saliency is computed. If forward_func takes a single tensor as input, a single input tensor should be provided. 
If forward_func takes multiple tensors as input, a tuple @@ -52,7 +52,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index 1df073f1a4..32d3bf0861 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -96,7 +96,7 @@ def attribute( Args: - inputs (tensor or tuple of tensors): Input for which Shapley value + inputs (Tensor or tuple of Tensor): Input for which Shapley value sampling attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. @@ -106,7 +106,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: @@ -135,7 +135,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -175,7 +175,7 @@ def attribute( Note that attributions are not computed with respect to these arguments. 
Default: None - feature_mask (tensor or tuple of tensors, optional): + feature_mask (Tensor or tuple of Tensor, optional): feature_mask defines a mask for the input, grouping features which should be added together. feature_mask should contain the same number of tensors as inputs. @@ -549,7 +549,7 @@ def attribute( Args: - inputs (tensor or tuple of tensors): Input for which Shapley value + inputs (Tensor or tuple of Tensor): Input for which Shapley value sampling attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. @@ -559,7 +559,7 @@ def attribute( to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference value which replaces each feature when ablated. Baselines can be provided as: @@ -588,7 +588,7 @@ def attribute( In the cases when `baselines` is not provided, we internally use zero scalar corresponding to each input tensor. Default: None - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which difference is computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, @@ -628,7 +628,7 @@ def attribute( Note that attributions are not computed with respect to these arguments. Default: None - feature_mask (tensor or tuple of tensors, optional): + feature_mask (Tensor or tuple of Tensor, optional): feature_mask defines a mask for the input, grouping features which should be added together. feature_mask should contain the same number of tensors as inputs. 
diff --git a/captum/attr/_models/base.py b/captum/attr/_models/base.py index 2f526c197a..0b9e406d73 100644 --- a/captum/attr/_models/base.py +++ b/captum/attr/_models/base.py @@ -57,7 +57,7 @@ def forward(self, *inputs, **kwargs): Returns: - embedding_tensor (tensor): + embedding_tensor (Tensor): Returns a tensor which is the same as first argument passed to the forward function. It passes pre-computed embedding tensors to lower layers diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index 4e2b2e45ee..ae7ed147fe 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -47,7 +47,7 @@ def __init__(self, forward_func: Callable) -> None: Args: - inputs (tensor or tuple of tensors): Input for which attribution + inputs (Tensor or tuple of Tensor): Input for which attribution is computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors are provided, the batch sizes must be aligned across all @@ -97,7 +97,7 @@ def has_convergence_delta(self) -> bool: Args: - attributions (tensor or tuple of tensors): Attribution scores that + attributions (Tensor or tuple of Tensor): Attribution scores that are precomputed by an attribution algorithm. Attributions can be provided in form of a single tensor or a tuple of those. It is assumed that attribution @@ -184,7 +184,7 @@ def compute_convergence_delta( Args: - attributions (tensor or tuple of tensors): Precomputed attribution + attributions (Tensor or tuple of Tensor): Precomputed attribution scores. The user can compute those using any attribution algorithm. It is assumed the shape and the dimensionality of attributions must match the shape and @@ -193,17 +193,17 @@ def compute_convergence_delta( dimension 0 corresponds to the number of examples, and if multiple input tensors are provided, the examples must be aligned appropriately. 
- start_point (tensor or tuple of tensors, optional): `start_point` + start_point (Tensor or tuple of Tensor, optional): `start_point` is passed as an input to model's forward function. It is the starting point of attributions' approximation. It is assumed that both `start_point` and `end_point` have the same shape and dimensionality. - end_point (tensor or tuple of tensors): `end_point` + end_point (Tensor or tuple of Tensor): `end_point` is passed as an input to model's forward function. It is the end point of attributions' approximation. It is assumed that both `start_point` and `end_point` have the same shape and dimensionality. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which gradients are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/attr/_utils/class_summarizer.py b/captum/attr/_utils/class_summarizer.py index b8b0921c8d..664088c299 100644 --- a/captum/attr/_utils/class_summarizer.py +++ b/captum/attr/_utils/class_summarizer.py @@ -36,11 +36,11 @@ def update( # type: ignore This accepts either a single tensor to summarise or a tuple of tensors. Args: - x (tensor or tuple of tensor): + x (Tensor or tuple of Tensor): The input tensor to be summarised. The first dimension of this input must be associated to the batch size of the inputs. - labels (int, tuple, tensor, or list, optional): + labels (int, tuple, Tensor, or list, optional): The associated labels for `x`. If Any, we assume `labels` represents the label for all inputs in `x`. 
diff --git a/captum/attr/_utils/summarizer.py b/captum/attr/_utils/summarizer.py index 2011f3b4d4..e4c5c860a0 100644 --- a/captum/attr/_utils/summarizer.py +++ b/captum/attr/_utils/summarizer.py @@ -193,7 +193,7 @@ def update(self, x: Tensor): Updates the summary of a given tensor `x` Args: - x (tensor): + x (Tensor): The tensor to summarize """ for stat in self._stats: diff --git a/captum/concept/_core/concept.py b/captum/concept/_core/concept.py index 74ccba2be5..b0adbd7f39 100644 --- a/captum/concept/_core/concept.py +++ b/captum/concept/_core/concept.py @@ -80,7 +80,7 @@ def __init__(self, model: Module) -> None: Args: - inputs (tensor or tuple of tensors): Inputs for which concept-based + inputs (Tensor or tuple of Tensor): Inputs for which concept-based interpretation scores are computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors are provided, the batch size (the first diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index 63ea0ee832..73b3c758bb 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -82,9 +82,9 @@ def __getitem__(self, i: int): i (int): which (activation vector, label) batch in the dataset to return Returns: - inputs (tensor): i-th batch in Dataset (representing activation + inputs (Tensor): i-th batch in Dataset (representing activation vectors) - labels (tensor): labels of i-th batch in Dataset + labels (Tensor): labels of i-th batch in Dataset """ assert i < self.length k = self._i_to_k(i) @@ -578,7 +578,7 @@ def interpret( Args: - inputs (tensor or tuple of tensors): Inputs for which predictions + inputs (Tensor or tuple of Tensor): Inputs for which predictions are performed and attributions are computed. If model takes a single tensor as input, a single input tensor should be provided. @@ -590,7 +590,7 @@ def interpret( provided, the examples must be aligned appropriately. 
experimental_sets (list of list of Concept): A list of list of Concept instances. - target (int, tuple, tensor, or list, optional): Output indices for + target (int, tuple, Tensor, or list, optional): Output indices for which attributions are computed (for classification cases, this is usually the target class). If the network returns a scalar value per example, diff --git a/captum/concept/_utils/classifier.py b/captum/concept/_utils/classifier.py index 73092edda8..b8ba7d0a59 100644 --- a/captum/concept/_utils/classifier.py +++ b/captum/concept/_utils/classifier.py @@ -95,7 +95,7 @@ def weights(self) -> Tensor: C is the number of classes and F is the number of features. Returns: - weights (tensor): A torch Tensor with the weights resulting from + weights (Tensor): A torch Tensor with the weights resulting from the model training. """ pass @@ -192,7 +192,7 @@ def weights(self) -> Tensor: In case of binary classification, C = 2 otherwise it is > 2. Returns: - weights (tensor): A torch Tensor with the weights resulting from + weights (Tensor): A torch Tensor with the weights resulting from the model training. """ assert self.lm.linear is not None, ( diff --git a/captum/influence/_core/similarity_influence.py b/captum/influence/_core/similarity_influence.py index 9e69e43af5..db1484466e 100644 --- a/captum/influence/_core/similarity_influence.py +++ b/captum/influence/_core/similarity_influence.py @@ -166,7 +166,7 @@ def influence( # type: ignore[override] ) -> Dict: r""" Args: - inputs (tensor or tuple of tensors): Batch of examples for which influential + inputs (Tensor or tuple of Tensor): Batch of examples for which influential instances are computed. They are passed to the forward_func. The first dimension in `inputs` tensor or tuple of tensors corresponds to the batch size. 
A tuple of tensors is only passed in if this diff --git a/captum/influence/_core/tracincp.py b/captum/influence/_core/tracincp.py index 638d48817a..8cb2ac7bfc 100644 --- a/captum/influence/_core/tracincp.py +++ b/captum/influence/_core/tracincp.py @@ -244,7 +244,7 @@ def self_influence( Default: False Returns: - self_influence_scores (tensor): This is a 1D tensor containing the self + self_influence_scores (Tensor): This is a 1D tensor containing the self influence scores of all examples in `inputs_dataset`, regardless of whether it represents a single batch or a `DataLoader` that yields batches. @@ -265,7 +265,7 @@ def _get_k_most_influential( inputs (tuple of Any): A tuple that represents a batch of examples. It does not represent labels, which are passed as `targets`. - targets (tensor, optional): If computing influence scores on a loss + targets (Tensor, optional): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None k (int, optional): The number of proponents or opponents to return per test @@ -315,13 +315,13 @@ def _influence( inputs (tuple of Any): A batch of examples. Does not represent labels, which are passed as `targets`. The assumption is that `model(*inputs)` produces the predictions for the batch. - targets (tensor, optional): If computing influence scores on a loss + targets (Tensor, optional): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None Returns: - influence_scores (tensor): Influence scores over the entire + influence_scores (Tensor): Influence scores over the entire training dataset `train_dataset`. Dimensionality is (inputs_batch_size, src_dataset_size). For example: influence_scores[i][j] = the influence score for the j-th training @@ -384,7 +384,7 @@ def influence( # type: ignore[override] `inputs` will need to be a tuple. 
In other words, `inputs` will be unpacked as an argument when passing to `model`. Default: None - targets (tensor, optional): If computing influence scores on a loss + targets (Tensor, optional): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None k (int, optional): If not provided or `None`, the influence score mode will @@ -715,7 +715,7 @@ def influence( # type: ignore[override] `inputs` will need to be a tuple. In other words, `inputs` will be unpacked as an argument when passing to `model`. Default: None - targets (tensor, optional): If computing influence scores on a loss + targets (Tensor, optional): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None k (int, optional): If not provided or `None`, the influence score mode will @@ -837,7 +837,7 @@ def _influence( inputs (tuple of Any): A test batch of examples. Does not represent labels, which are passed as `targets`. The assumption is that `model(*inputs)` produces the predictions for the batch. - targets (tensor, optional): If computing influence scores on a loss + targets (Tensor, optional): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None show_progress (bool, optional): To compute the influence of examples in @@ -851,7 +851,7 @@ def _influence( Default: False Returns: - influence_scores (tensor): Influence scores from the TracInCP method. + influence_scores (Tensor): Influence scores from the TracInCP method. Its shape is `(input_size, train_dataset_size)`, where `input_size` is the number of examples in the test batch, and `train_dataset_size` is the number of examples in @@ -892,7 +892,7 @@ def _get_k_most_influential( inputs (tuple of Any): A tuple that represents a batch of examples. It does not represent labels, which are passed as `targets`. 
- targets (tensor, optional): If computing influence scores on a loss + targets (Tensor, optional): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None k (int, optional): The number of proponents or opponents to return per test @@ -997,7 +997,7 @@ def _self_influence_by_checkpoints( Default: False Returns: - self_influence_scores (tensor): This is a 1D tensor containing the self + self_influence_scores (Tensor): This is a 1D tensor containing the self influence scores of all examples in `inputs_dataset`, regardless of whether it represents a single batch or a `DataLoader` that yields batches. diff --git a/captum/influence/_core/tracincp_fast_rand_proj.py b/captum/influence/_core/tracincp_fast_rand_proj.py index 5c9d85828a..2d9bcaa519 100644 --- a/captum/influence/_core/tracincp_fast_rand_proj.py +++ b/captum/influence/_core/tracincp_fast_rand_proj.py @@ -240,7 +240,7 @@ def influence( # type: ignore[override] `inputs` will need to be a tuple. In other words, `inputs` will be unpacked as an argument when passing to `model`. Default: None - targets (tensor, optional): The labels corresponding to the batch `inputs`. + targets (Tensor, optional): The labels corresponding to the batch `inputs`. This method is designed to be applied for a loss function, so `targets` is required, unless running in "self influence" mode. Default: None @@ -365,7 +365,7 @@ def _influence( # type: ignore[override] inputs (tuple of Any): A batch of examples. Does not represent labels, which are passed as `targets`. The assumption is that `model(*inputs)` produces the predictions for the batch. - targets (tensor): The labels corresponding to the batch `inputs`. This + targets (Tensor): The labels corresponding to the batch `inputs`. This method is designed to be applied for a loss function, so labels are required. 
show_progress (bool, optional): To compute the influence of examples in @@ -379,7 +379,7 @@ def _influence( # type: ignore[override] Default: False Returns: - influence_scores (tensor): Influence scores from the TracInCPFast method. + influence_scores (Tensor): Influence scores from the TracInCPFast method. Its shape is `(input_size, train_dataset_size)`, where `input_size` is the number of examples in the test batch, and `train_dataset_size` is the number of examples in @@ -422,7 +422,7 @@ def _get_k_most_influential( # type: ignore[override] inputs (tuple of Any): A tuple that represents a batch of examples. It does not represent labels, which are passed as `targets`. - targets (tensor): The labels corresponding to the batch `inputs`. This + targets (Tensor): The labels corresponding to the batch `inputs`. This method is designed to be applied for a loss function, so labels are required. k (int, optional): The number of proponents or opponents to return per test @@ -527,7 +527,7 @@ def _self_influence_by_checkpoints( Default: False Returns: - self_influence_scores (tensor): This is a 1D tensor containing the self + self_influence_scores (Tensor): This is a 1D tensor containing the self influence scores of all examples in `inputs_dataset`, regardless of whether it represents a single batch or a `DataLoader` that yields batches. @@ -710,7 +710,7 @@ def _basic_computation_tracincp_fast( or test batch, depending which method is the caller. Does not represent labels, which are passed as `targets`. The assumption is that `model(*inputs)` produces the predictions for the batch. - targets (tensor): If computing influence scores on a loss function, + targets (Tensor): If computing influence scores on a loss function, these are the labels corresponding to the batch `inputs`. """ layer_inputs: Dict[device, Tuple[Tensor, ...]] = defaultdict() @@ -957,12 +957,12 @@ def _influence( # type: ignore[override] inputs (tuple of Any): A batch of examples. 
Does not represent labels, which are passed as `targets`. The assumption is that `model(*inputs)` produces the predictions for the batch. - targets (tensor): The labels corresponding to the batch `inputs`. This + targets (Tensor): The labels corresponding to the batch `inputs`. This method is designed to be applied for a loss function, so labels are required. Returns: - influence_scores (tensor): Influence scores from the + influence_scores (Tensor): Influence scores from the TracInCPFastRandProj method. Its shape is `(input_size, train_dataset_size)`, where `input_size` is the number of examples in the test batch, and `train_dataset_size` is @@ -994,7 +994,7 @@ def _get_k_most_influential( # type: ignore[override] inputs (tuple of Any): A tuple that represents a batch of examples. It does not represent labels, which are passed as `targets`. - targets (tensor): The labels corresponding to the batch `inputs`. This + targets (Tensor): The labels corresponding to the batch `inputs`. This method is designed to be applied for a loss function, so labels are required. k (int, optional): The number of proponents or opponents to return per test @@ -1089,7 +1089,7 @@ def self_influence( Default: False Returns: - self_influence_scores (tensor): This is a 1D tensor containing the self + self_influence_scores (Tensor): This is a 1D tensor containing the self influence scores of all examples in `inputs_dataset`, regardless of whether it represents a single batch or a `DataLoader` that yields batches. @@ -1157,7 +1157,7 @@ def influence( # type: ignore[override] `inputs` will need to be a tuple. In other words, `inputs` will be unpacked as an argument when passing to `model`. Default: None - targets (tensor): The labels corresponding to the batch `inputs`. This + targets (Tensor): The labels corresponding to the batch `inputs`. This method is designed to be applied for a loss function, so `targets` is required. 
k (int, optional): If not provided or `None`, the influence score mode will @@ -1322,7 +1322,7 @@ def _process_src_intermediate_quantities_tracincp_fast_rand_proj( Args: - src_intermediate_quantities (tensor): the output of the + src_intermediate_quantities (Tensor): the output of the `_get_intermediate_quantities_tracin_fast_rand_proj` function when applied to training dataset `train_dataset`. This output is the vector representation of all training examples. @@ -1354,7 +1354,7 @@ def _get_intermediate_quantities_tracincp_fast_rand_proj( projection is to be applied. Returns: - checkpoint_projections (tensor): A tensor of dimension + checkpoint_projections (Tensor): A tensor of dimension (N, D * C), where N is total number of examples in `dataloader`, C is the number of checkpoints passed as the `checkpoints` argument of `TracInCPFastRandProj.__init__`, and each row represents the diff --git a/captum/influence/_utils/common.py b/captum/influence/_utils/common.py index 79fdcd7c2b..356f09b8e6 100644 --- a/captum/influence/_utils/common.py +++ b/captum/influence/_utils/common.py @@ -91,12 +91,12 @@ def _jacobian_loss_wrt_inputs( torch.nn.Module. If a custom loss is provided, it can be either type, but must behave as a library loss function would if `reduction='sum'` or `reduction='mean'`. - out (tensor): This is a tensor that represents the batch of inputs to + out (Tensor): This is a tensor that represents the batch of inputs to `loss_fn`. In practice, this will be the output of a model; this is why this argument is named `out`. `out` is a 2D tensor of shape (batch size, model output dimensionality). We will call `loss_fn` via `loss_fn(out, targets)`. - targets (tensor): The labels for the batch of inputs. + targets (Tensor): The labels for the batch of inputs. vectorize (bool): Flag to use experimental vectorize functionality for `torch.autograd.functional.jacobian`. reduction_type (str): The type of reduction used by `loss_fn`. 
If `loss_fn` @@ -104,7 +104,7 @@ def _jacobian_loss_wrt_inputs( only be "mean" or "sum". Returns: - jacobians (tensor): Returns the jacobian of the per-sample loss (implicitly + jacobians (Tensor): Returns the jacobian of the per-sample loss (implicitly defined by `loss_fn` and `reduction_type`) w.r.t each sample in the batch represented by `out`. This is a 2D tensor, where the first dimension is the batch dimension. @@ -206,7 +206,7 @@ def _get_k_most_influential_helper( in the `influence_src_dataloader` argument. inputs (tuple of Any): A batch of examples. Does not represent labels, which are passed as `targets`. - targets (tensor, optional): If computing TracIn scores on a loss function, + targets (Tensor, optional): If computing TracIn scores on a loss function, these are the labels corresponding to the batch `inputs`. Default: None k (int, optional): The number of proponents or opponents to return per test diff --git a/captum/influence/_utils/nearest_neighbors.py b/captum/influence/_utils/nearest_neighbors.py index 3ecd452de3..fa8d6d7136 100644 --- a/captum/influence/_utils/nearest_neighbors.py +++ b/captum/influence/_utils/nearest_neighbors.py @@ -34,7 +34,7 @@ def get_nearest_neighbors( so that `query` is 2D. Args: - query (tensor): tensor representing the batch of tensors for which k-nearest + query (Tensor): tensor representing the batch of tensors for which k-nearest neighbors are desired. `query` is of shape (N, *), where N is the size of the batch, i.e. the 0-th dimension of `query` indexes the batch. * denotes an arbitrary shape, so that each tensor in the @@ -68,7 +68,7 @@ def setup(self, data: torch.Tensor) -> None: dimension indexes the tensors in the stored tensors. Args: - data (tensor): A tensor of shape (N, *) representing the stored tensors. + data (Tensor): A tensor of shape (N, *) representing the stored tensors. The 0-th dimension indexes the tensors in the stored tensors, so that `data[i]` is the tensor with index `i`. 
The nearest neighbors of a query will be referred to by their index. @@ -129,7 +129,7 @@ def setup(self, data: torch.Tensor) -> None: tensors. Args: - data (tensor): A tensor of shape (N, *) representing the stored tensors. + data (Tensor): A tensor of shape (N, *) representing the stored tensors. The 0-th dimension indexes the tensors in the stored tensors, so that `data[i]` is the tensor with index `i`. The nearest neighbors of a query will be referred to by their index. @@ -160,7 +160,7 @@ def get_nearest_neighbors( dot-product of the flattened version of tensors. Args: - query (tensor): tensor representing the batch of tensors for which k-nearest + query (Tensor): tensor representing the batch of tensors for which k-nearest neighbors are desired. `query` is of shape (N, *), where N is the size of the batch, i.e. the 0-th dimension of `query` indexes the batch. * denotes an arbitrary shape, so that each tensor in the diff --git a/captum/insights/attr_vis/app.py b/captum/insights/attr_vis/app.py index 54ca7f3ac3..fe7e0bbcda 100644 --- a/captum/insights/attr_vis/app.py +++ b/captum/insights/attr_vis/app.py @@ -108,7 +108,7 @@ def __init__( Args: - inputs (tensor or tuple of tensors): Batch of inputs for a model. + inputs (Tensor or tuple of Tensor): Batch of inputs for a model. These may be either a Tensor or tuple of tensors. Each tensor must correspond to a feature for AttributionVisualizer, and the corresponding input transform function of the feature @@ -116,7 +116,7 @@ def __init__( model. It is assumed that the first dimension of each input tensor corresponds to the number of examples (batch size) and is aligned for all input tensors. - labels (tensor): Tensor containing correct labels for input examples. + labels (Tensor): Tensor containing correct labels for input examples. This must be a 1D tensor with length matching the first dimension of each input tensor. 
additional_args (tuple, optional): If the forward function diff --git a/captum/metrics/_core/infidelity.py b/captum/metrics/_core/infidelity.py index 50142e08fd..a10b2e2812 100644 --- a/captum/metrics/_core/infidelity.py +++ b/captum/metrics/_core/infidelity.py @@ -211,7 +211,7 @@ def infidelity( input examples that are repeated `max_examples_per_batch / batch_size` times within the batch. - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which attributions are computed. If forward_func takes a single tensor as input, a single input tensor should be provided. If forward_func takes multiple tensors as input, a tuple @@ -221,7 +221,7 @@ def infidelity( multiple input tensors are provided, the examples must be aligned appropriately. - baselines (scalar, tensor, tuple of scalar, or tensors, optional): + baselines (scalar, Tensor, tuple of scalar, or Tensor, optional): Baselines define reference values which sometimes represent ablated values and are used to compare with the actual inputs to compute importance scores in attribution algorithms. They can be represented @@ -250,7 +250,7 @@ def infidelity( Default: None - attributions (tensor or tuple of tensors): + attributions (Tensor or tuple of Tensor): Attribution scores computed based on an attribution algorithm. This attribution scores can be computed using the implementations provided in the `captum.attr` package. Some of those attribution @@ -305,7 +305,7 @@ def infidelity( being passed to `perturb_func` as an input argument. Default: None - target (int, tuple, tensor, or list, optional): Indices for selecting + target (int, tuple, Tensor, or list, optional): Indices for selecting predictions from output(for classification cases, this is usually the target class). 
If the network returns a scalar value per example, no target @@ -366,7 +366,7 @@ def infidelity( Default: False Returns: - infidelities (tensor): A tensor of scalar infidelity scores per + infidelities (Tensor): A tensor of scalar infidelity scores per input example. The first dimension is equal to the number of examples in the input batch and the second dimension is one. diff --git a/captum/metrics/_core/sensitivity.py b/captum/metrics/_core/sensitivity.py index 1f4f22877e..f0c841a5a1 100644 --- a/captum/metrics/_core/sensitivity.py +++ b/captum/metrics/_core/sensitivity.py @@ -30,7 +30,7 @@ def default_perturb_func( Args: - inputs (tensor or a tuple of tensors): The input tensors that we'd + inputs (Tensor or tuple of Tensor): The input tensors that we'd like to perturb by adding a random noise sampled uniformly random from an L_infinity ball with a radius `perturb_radius`. @@ -39,7 +39,7 @@ def default_perturb_func( Returns: - perturbed_input (tuple of tensor): A list of perturbed inputs that + perturbed_input (tuple of Tensor): A list of perturbed inputs that are created by adding noise sampled uniformly random from L_infiniy ball with a radius `perturb_radius` to the original inputs. @@ -108,7 +108,7 @@ def sensitivity_max( attribution algorithm or any other explanation method that returns the explanations. - inputs (tensor or tuple of tensors): Input for which + inputs (Tensor or tuple of Tensor): Input for which explanations are computed. If `explanation_func` takes a single tensor as input, a single input tensor should be provided. @@ -178,7 +178,7 @@ def sensitivity_max( Returns: - sensitivities (tensor): A tensor of scalar sensitivity scores per + sensitivities (Tensor): A tensor of scalar sensitivity scores per input example. The first dimension is equal to the number of examples in the input batch and the second dimension is one. 
Returned sensitivities are normalized by diff --git a/captum/metrics/_utils/batching.py b/captum/metrics/_utils/batching.py index c906307c09..83a773bda3 100644 --- a/captum/metrics/_utils/batching.py +++ b/captum/metrics/_utils/batching.py @@ -38,7 +38,7 @@ def _divide_and_aggregate_metrics( Returns: - metric (tensor): A metric score estimated by `metric_func` per + metric (Tensor): A metric score estimated by `metric_func` per input example. """ bsz = inputs[0].size(0) diff --git a/captum/robust/_core/fgsm.py b/captum/robust/_core/fgsm.py index be6f3a474c..fe847331dc 100644 --- a/captum/robust/_core/fgsm.py +++ b/captum/robust/_core/fgsm.py @@ -87,7 +87,7 @@ def perturb( Args: - inputs (tensor or tuple of tensors): Input for which adversarial + inputs (Tensor or tuple of Tensor): Input for which adversarial attack is computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors are provided, the batch sizes must be diff --git a/captum/robust/_core/perturbation.py b/captum/robust/_core/perturbation.py index 76129d4749..3edbe351e7 100644 --- a/captum/robust/_core/perturbation.py +++ b/captum/robust/_core/perturbation.py @@ -18,7 +18,7 @@ class Perturbation: Args: - inputs (tensor or tuple of tensors): Input for which adversarial attack + inputs (Tensor or tuple of Tensor): Input for which adversarial attack is computed. It can be provided as a single tensor or a tuple of multiple tensors. If multiple input tensors are provided, the batch sizes must be aligned across all diff --git a/captum/robust/_core/pgd.py b/captum/robust/_core/pgd.py index 8c22302497..d342508919 100644 --- a/captum/robust/_core/pgd.py +++ b/captum/robust/_core/pgd.py @@ -83,7 +83,7 @@ def perturb( Args: - inputs (tensor or tuple of tensors): Input for which adversarial + inputs (Tensor or tuple of Tensor): Input for which adversarial attack is computed. It can be provided as a single tensor or a tuple of multiple tensors. 
If multiple input tensors are provided, the batch sizes must be From 8752c8525ffd3d89352c76701aafae1e5fe581ee Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 1 Sep 2022 15:54:53 -0600 Subject: [PATCH 78/84] torch.Tensor -> Tensor --- captum/attr/_utils/attribution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index ae7ed147fe..29158bcfdf 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -393,7 +393,7 @@ def interpolate( Args: - layer_attribution (torch.Tensor): Tensor of given layer attributions. + layer_attribution (Tensor): Tensor of given layer attributions. interpolate_dims (int or tuple): Upsampled dimensions. The number of elements must be the number of dimensions of layer_attribution - 2, since the first dimension From a99ff470a1cff4b64579a29f940153e0592885d6 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 1 Sep 2022 16:46:03 -0600 Subject: [PATCH 79/84] Add temp code for testing --- scripts/install_via_conda.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/install_via_conda.sh b/scripts/install_via_conda.sh index 88a9603ade..2e12ebd9a9 100755 --- a/scripts/install_via_conda.sh +++ b/scripts/install_via_conda.sh @@ -25,6 +25,15 @@ conda install -y conda-build conda install -n base conda-libmamba-solver conda config --set experimental_solver libmamba +OUTPUT=$(conda info) +echo "${OUTPUT}" + +OUTPUT=$(conda list --show-channel-urls) +echo "${OUTPUT}" + +OUTPUT=$(conda config --show-sources) +echo "${OUTPUT}" + # install other frameworks if asked for and make sure this is before pytorch if [[ $FRAMEWORKS == true ]]; then pip install pytext-nlp From a2d1678b05437d94342d8b3baddd28023bc311c2 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 1 Sep 2022 17:00:12 -0600 Subject: [PATCH 80/84] Revert change --- scripts/install_via_conda.sh | 9 --------- 1 file changed, 9 deletions(-) diff --git 
a/scripts/install_via_conda.sh b/scripts/install_via_conda.sh index 2e12ebd9a9..88a9603ade 100755 --- a/scripts/install_via_conda.sh +++ b/scripts/install_via_conda.sh @@ -25,15 +25,6 @@ conda install -y conda-build conda install -n base conda-libmamba-solver conda config --set experimental_solver libmamba -OUTPUT=$(conda info) -echo "${OUTPUT}" - -OUTPUT=$(conda list --show-channel-urls) -echo "${OUTPUT}" - -OUTPUT=$(conda config --show-sources) -echo "${OUTPUT}" - # install other frameworks if asked for and make sure this is before pytorch if [[ $FRAMEWORKS == true ]]; then pip install pytext-nlp From 14d1af59b1afb75f5c5339ad22a618c997271494 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Thu, 1 Sep 2022 18:44:23 -0600 Subject: [PATCH 81/84] Remove approx methods from index.rst --- sphinx/source/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/sphinx/source/index.rst b/sphinx/source/index.rst index aa67ab6b54..c54d99c28c 100644 --- a/sphinx/source/index.rst +++ b/sphinx/source/index.rst @@ -21,7 +21,6 @@ Captum API Reference influence utilities base_classes - approximation_methods .. 
toctree:: :maxdepth: 2 From ee6825991785d8d9fb15997ab56bbb2dc44e2f57 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Fri, 2 Sep 2022 11:36:29 -0600 Subject: [PATCH 82/84] Add Tensor to autodoc_process_docstring --- sphinx/source/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sphinx/source/conf.py b/sphinx/source/conf.py index 993751bb91..b01d1c8b81 100644 --- a/sphinx/source/conf.py +++ b/sphinx/source/conf.py @@ -245,6 +245,9 @@ def autodoc_process_docstring( lines[i] = re.sub(_rt[0] + r"Iterator" + _rt[1], "~typing.Iterator", lines[i]) lines[i] = re.sub(_rt[0] + r"Iterable" + _rt[1], "~typing.Iterable", lines[i]) + # Ensure Tensor type is hyperlinked by interpshinx + lines[i] = re.sub(_rt[0] + r"Tensor" + _rt[1], "~torch.Tensor", lines[i]) + def setup(app) -> None: app.connect("autodoc-process-docstring", autodoc_process_docstring) From 6abb35c63a21f65b51bbed557b66db4141f97199 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Fri, 2 Sep 2022 12:12:10 -0600 Subject: [PATCH 83/84] Return Types: tensor & tensors -> Tensor * list of list of Concept -> list[list[Concept]] * Return Types: tensor & tensors -> Tensor --- captum/attr/_core/deep_lift.py | 8 ++++---- captum/attr/_core/feature_ablation.py | 4 ++-- captum/attr/_core/feature_permutation.py | 4 ++-- captum/attr/_core/gradient_shap.py | 4 ++-- .../attr/_core/guided_backprop_deconvnet.py | 8 ++++---- captum/attr/_core/guided_grad_cam.py | 4 ++-- captum/attr/_core/input_x_gradient.py | 4 ++-- captum/attr/_core/integrated_gradients.py | 4 ++-- captum/attr/_core/kernel_shap.py | 4 ++-- captum/attr/_core/layer/grad_cam.py | 4 ++-- captum/attr/_core/layer/internal_influence.py | 4 ++-- captum/attr/_core/layer/layer_activation.py | 4 ++-- captum/attr/_core/layer/layer_conductance.py | 4 ++-- captum/attr/_core/layer/layer_deep_lift.py | 8 ++++---- .../_core/layer/layer_feature_ablation.py | 4 ++-- .../attr/_core/layer/layer_gradient_shap.py | 4 ++-- .../layer/layer_gradient_x_activation.py | 4 ++-- 
.../_core/layer/layer_integrated_gradients.py | 6 +++--- captum/attr/_core/layer/layer_lrp.py | 6 +++--- captum/attr/_core/lime.py | 4 ++-- captum/attr/_core/lrp.py | 8 ++++---- .../attr/_core/neuron/neuron_conductance.py | 4 ++-- captum/attr/_core/neuron/neuron_deep_lift.py | 4 ++-- .../_core/neuron/neuron_feature_ablation.py | 4 ++-- captum/attr/_core/neuron/neuron_gradient.py | 4 ++-- .../attr/_core/neuron/neuron_gradient_shap.py | 2 +- .../neuron_guided_backprop_deconvnet.py | 8 ++++---- .../neuron/neuron_integrated_gradients.py | 4 ++-- captum/attr/_core/noise_tunnel.py | 2 +- captum/attr/_core/occlusion.py | 4 ++-- captum/attr/_core/saliency.py | 4 ++-- captum/attr/_core/shapley_value.py | 8 ++++---- captum/attr/_utils/attribution.py | 20 +++++++++---------- captum/concept/_core/tcav.py | 4 ++-- captum/robust/_core/fgsm.py | 2 +- captum/robust/_core/perturbation.py | 2 +- captum/robust/_core/pgd.py | 2 +- 37 files changed, 91 insertions(+), 91 deletions(-) diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py index 425aa2a2ee..ea059d7fcc 100644 --- a/captum/attr/_core/deep_lift.py +++ b/captum/attr/_core/deep_lift.py @@ -288,7 +288,7 @@ def attribute( # type: ignore Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution score computed based on DeepLift rescale rule with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value @@ -296,7 +296,7 @@ def attribute( # type: ignore If a single tensor is provided as inputs, a single tensor is returned. If a tuple is provided for inputs, a tuple of corresponding sized tensors is returned. 
- - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): This is computed using the property that the total sum of forward_func(inputs) - forward_func(baselines) must equal the total sum of the attributions computed @@ -791,7 +791,7 @@ def attribute( # type: ignore Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution score computed based on DeepLift rescale rule with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value @@ -799,7 +799,7 @@ def attribute( # type: ignore If a single tensor is provided as inputs, a single tensor is returned. If a tuple is provided for inputs, a tuple of corresponding sized tensors is returned. - - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): This is computed using the property that the total sum of forward_func(inputs) - forward_func(baselines) must be very close to the total sum of attributions diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py index 862ce085c4..70de13e81c 100644 --- a/captum/attr/_core/feature_ablation.py +++ b/captum/attr/_core/feature_ablation.py @@ -194,8 +194,8 @@ def attribute( Default: None Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The attributions with respect to each input feature. 
If the forward function returns a scalar value per example, attributions will be diff --git a/captum/attr/_core/feature_permutation.py b/captum/attr/_core/feature_permutation.py index 87b270799b..9aac4c11a1 100644 --- a/captum/attr/_core/feature_permutation.py +++ b/captum/attr/_core/feature_permutation.py @@ -204,8 +204,8 @@ def attribute( # type: ignore Default: None Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The attributions with respect to each input feature. If the forward function returns a scalar value per example, attributions will be diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py index 4db185eeda..f6ec8da302 100644 --- a/captum/attr/_core/gradient_shap.py +++ b/captum/attr/_core/gradient_shap.py @@ -215,7 +215,7 @@ def attribute( Default: False Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution score computed based on GradientSHAP with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value @@ -223,7 +223,7 @@ def attribute( If a single tensor is provided as inputs, a single tensor is returned. If a tuple is provided for inputs, a tuple of corresponding sized tensors is returned. 
- - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): This is computed using the property that the total sum of forward_func(inputs) - forward_func(baselines) must be very close to the total sum of the attributions diff --git a/captum/attr/_core/guided_backprop_deconvnet.py b/captum/attr/_core/guided_backprop_deconvnet.py index 21dc3154a2..ba2c2114c5 100644 --- a/captum/attr/_core/guided_backprop_deconvnet.py +++ b/captum/attr/_core/guided_backprop_deconvnet.py @@ -186,8 +186,8 @@ def attribute( Default: None Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The guided backprop gradients with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value @@ -297,8 +297,8 @@ def attribute( Default: None Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The deconvolution attributions with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py index 01e53e9a1f..3c7478bae8 100644 --- a/captum/attr/_core/guided_grad_cam.py +++ b/captum/attr/_core/guided_grad_cam.py @@ -151,8 +151,8 @@ def attribute( Default: False Returns: - *tensor* of **attributions**: - - **attributions** (*tensor*): + *Tensor* of **attributions**: + - **attributions** (*Tensor*): Element-wise product of (upsampled) GradCAM and Guided Backprop attributions. 
If a single tensor is provided as inputs, a single tensor is diff --git a/captum/attr/_core/input_x_gradient.py b/captum/attr/_core/input_x_gradient.py index a2a9978032..fcf1d85025 100644 --- a/captum/attr/_core/input_x_gradient.py +++ b/captum/attr/_core/input_x_gradient.py @@ -84,8 +84,8 @@ def attribute( Default: None Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The input x gradient with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value diff --git a/captum/attr/_core/integrated_gradients.py b/captum/attr/_core/integrated_gradients.py index 7b16b58466..04896fac61 100644 --- a/captum/attr/_core/integrated_gradients.py +++ b/captum/attr/_core/integrated_gradients.py @@ -233,7 +233,7 @@ def attribute( # type: ignore Default: False Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Integrated gradients with respect to each input feature. attributions will always be the same size as the provided inputs, with each value providing the attribution of the @@ -241,7 +241,7 @@ def attribute( # type: ignore If a single tensor is provided as inputs, a single tensor is returned. If a tuple is provided for inputs, a tuple of corresponding sized tensors is returned. - - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): The difference between the total approximated and true integrated gradients. 
This is computed using the property that the total sum of forward_func(inputs) - diff --git a/captum/attr/_core/kernel_shap.py b/captum/attr/_core/kernel_shap.py index bf95182747..12da6991dc 100644 --- a/captum/attr/_core/kernel_shap.py +++ b/captum/attr/_core/kernel_shap.py @@ -220,8 +220,8 @@ def attribute( # type: ignore Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The attributions with respect to each input feature. If return_input_shape = True, attributions will be the same size as the provided inputs, with each value diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py index f848ff9b28..bcbcb02af7 100644 --- a/captum/attr/_core/layer/grad_cam.py +++ b/captum/attr/_core/layer/grad_cam.py @@ -151,8 +151,8 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Attributions based on GradCAM method. Attributions will be the same size as the output of the given layer, except for dimension 2, diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py index bc40893fd0..46aba1ff61 100644 --- a/captum/attr/_core/layer/internal_influence.py +++ b/captum/attr/_core/layer/internal_influence.py @@ -187,8 +187,8 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Internal influence of each neuron in given layer output. 
Attributions will always be the same size as the output or input of the given layer depending on diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py index 3c5e63d70b..c4244e5966 100644 --- a/captum/attr/_core/layer/layer_activation.py +++ b/captum/attr/_core/layer/layer_activation.py @@ -87,8 +87,8 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* or list of **attributions**: - - **attributions** (*tensor* or tuple of *tensors* or *list*): + *Tensor* or tuple of *Tensor* or list of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor* or *list*): Activation of each neuron in given layer output. Attributions will always be the same size as the output of the given layer. diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py index f587690572..b8d9bc563f 100644 --- a/captum/attr/_core/layer/layer_conductance.py +++ b/captum/attr/_core/layer/layer_conductance.py @@ -235,7 +235,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Conductance of each neuron in given layer input or output. Attributions will always be the same size as the input or output of the given layer, depending on @@ -245,7 +245,7 @@ def attribute( Attributions are returned in a tuple if the layer inputs / outputs contain multiple tensors, otherwise a single tensor is returned. - - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): The difference between the total approximated and true conductance. 
This is computed using the property that the total sum of diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py index aa31d51b38..362f250170 100644 --- a/captum/attr/_core/layer/layer_deep_lift.py +++ b/captum/attr/_core/layer/layer_deep_lift.py @@ -256,7 +256,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution score computed based on DeepLift's rescale rule with respect to layer's inputs or outputs. Attributions will always be the same size as the provided layer's inputs or outputs, depending on @@ -265,7 +265,7 @@ def attribute( just a tensor is returned; if the layer input / output has multiple tensors, then a corresponding tuple of tensors is returned. - - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): This is computed using the property that the total sum of forward_func(inputs) - forward_func(baselines) must equal the total sum of the attributions computed based on DeepLift's @@ -587,7 +587,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution score computed based on DeepLift's rescale rule with respect to layer's inputs or outputs. Attributions will always be the same size as the provided layer's inputs @@ -598,7 +598,7 @@ def attribute( from a forward hook. For standard modules, inputs of a single tensor are usually wrapped in a tuple, while outputs of a single tensor are not. 
- - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): This is computed using the property that the total sum of forward_func(inputs) - forward_func(baselines) must be very close to the total sum of attributions diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py index 80e22d154f..ee7df14ff7 100644 --- a/captum/attr/_core/layer/layer_feature_ablation.py +++ b/captum/attr/_core/layer/layer_feature_ablation.py @@ -171,8 +171,8 @@ def attribute( Default: 1 Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution of each neuron in given layer input or output. Attributions will always be the same size as the input or output of the given layer, depending on diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py index 4f0cdbe63e..b6dfda9106 100644 --- a/captum/attr/_core/layer/layer_gradient_shap.py +++ b/captum/attr/_core/layer/layer_gradient_shap.py @@ -246,7 +246,7 @@ def attribute( Default: False Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution score computed based on GradientSHAP with respect to layer's input or output. Attributions will always be the same size as the provided layer's inputs or outputs, @@ -255,7 +255,7 @@ def attribute( Attributions are returned in a tuple if the layer inputs / outputs contain multiple tensors, otherwise a single tensor is returned. 
- - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): This is computed using the property that the total sum of forward_func(inputs) - forward_func(baselines) must be very close to the total sum of the attributions diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py index 5de6e6a78b..385a1491c4 100644 --- a/captum/attr/_core/layer/layer_gradient_x_activation.py +++ b/captum/attr/_core/layer/layer_gradient_x_activation.py @@ -134,8 +134,8 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* or list of **attributions**: - - **attributions** (*tensor* or tuple of *tensors* or *list*): + *Tensor* or tuple of *Tensor* or list of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor* or *list*): Product of gradient and activation for each neuron in given layer output. Attributions will always be the same size as the diff --git a/captum/attr/_core/layer/layer_integrated_gradients.py b/captum/attr/_core/layer/layer_integrated_gradients.py index 9c259efd2d..d67f52cad5 100644 --- a/captum/attr/_core/layer/layer_integrated_gradients.py +++ b/captum/attr/_core/layer/layer_integrated_gradients.py @@ -310,8 +310,8 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor*, tuple of *tensors* or tuple of - *tensors*): Integrated gradients with respect to `layer`'s inputs + - **attributions** (*Tensor*, tuple of *Tensor* or tuple of + *Tensor*): Integrated gradients with respect to `layer`'s inputs or outputs. Attributions will always be the same size and dimensionality as the input or output of the given layer, depending on whether we attribute to the inputs or outputs @@ -330,7 +330,7 @@ def attribute( a tuple of tensors. The ordering of the outputs will be the same order as the layers given in the constructor. 
- - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): The difference between the total approximated and true integrated gradients. This is computed using the property that the total sum of forward_func(inputs) - diff --git a/captum/attr/_core/layer/layer_lrp.py b/captum/attr/_core/layer/layer_lrp.py index 475a534025..613c41b962 100644 --- a/captum/attr/_core/layer/layer_lrp.py +++ b/captum/attr/_core/layer/layer_lrp.py @@ -175,10 +175,10 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* of **attributions** or 2-element tuple of + *Tensor* or tuple of *Tensor* of **attributions** or 2-element tuple of **attributions**, **delta** or list of **attributions** and **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): The propagated relevance values with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value @@ -190,7 +190,7 @@ def attribute( implementations. If attributions for all layers are returned (layer=None) a list of tensors or tuples of tensors is returned with entries for each layer. 
- - **delta** (*tensor* or list of *tensors* + - **delta** (*Tensor* or list of *Tensor* returned if return_convergence_delta=True): Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of diff --git a/captum/attr/_core/lime.py b/captum/attr/_core/lime.py index 87cf5fee8f..1f94bb9cb2 100644 --- a/captum/attr/_core/lime.py +++ b/captum/attr/_core/lime.py @@ -1013,8 +1013,8 @@ def attribute( # type: ignore Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The attributions with respect to each input feature. If return_input_shape = True, attributions will be the same size as the provided inputs, with each value diff --git a/captum/attr/_core/lrp.py b/captum/attr/_core/lrp.py index 5e32225382..d557f0ce20 100644 --- a/captum/attr/_core/lrp.py +++ b/captum/attr/_core/lrp.py @@ -155,10 +155,10 @@ def attribute( of rules is printed during propagation. Returns: - *tensor* or tuple of *tensors* of **attributions** + *Tensor* or tuple of *Tensor* of **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): The propagated relevance values with respect to each input feature. The values are normalized by the output score value (sum(relevance)=1). To obtain values comparable to other @@ -172,7 +172,7 @@ def attribute( is one and not corresponding to the prediction score as in other implementations. - - **delta** (*tensor*, returned if return_convergence_delta=True): + - **delta** (*Tensor*, returned if return_convergence_delta=True): Delta is calculated per example, meaning that the number of elements in returned delta tensor is equal to the number of of examples in the inputs. 
@@ -260,7 +260,7 @@ def compute_convergence_delta( should only have a single element. Returns: - *tensor*: + *Tensor*: - **delta** Difference of relevance in output layer and input layer. """ if isinstance(attributions, tuple): diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py index 135b44453f..004d941cb9 100644 --- a/captum/attr/_core/neuron/neuron_conductance.py +++ b/captum/attr/_core/neuron/neuron_conductance.py @@ -244,8 +244,8 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Conductance for particular neuron with respect to each input feature. Attributions will always be the same size as the provided diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py index 97a0dc72db..d486bdea51 100644 --- a/captum/attr/_core/neuron/neuron_deep_lift.py +++ b/captum/attr/_core/neuron/neuron_deep_lift.py @@ -207,7 +207,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Computes attributions using Deeplift's rescale rule for particular neuron with respect to each input feature. Attributions will always be the same size as the provided @@ -445,7 +445,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Computes attributions using Deeplift's rescale rule for particular neuron with respect to each input feature. 
Attributions will always be the same size as the provided diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py index 3b996ffb6f..8ee73197da 100644 --- a/captum/attr/_core/neuron/neuron_feature_ablation.py +++ b/captum/attr/_core/neuron/neuron_feature_ablation.py @@ -189,8 +189,8 @@ def attribute( Default: 1 Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Attributions of particular neuron with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value providing the attribution diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py index 76b2c30431..d948dfee1a 100644 --- a/captum/attr/_core/neuron/neuron_gradient.py +++ b/captum/attr/_core/neuron/neuron_gradient.py @@ -130,8 +130,8 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Gradients of particular neuron with respect to each input feature. 
Attributions will always be the same size as the provided inputs, with each value providing the attribution diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py index a142523784..338949352e 100644 --- a/captum/attr/_core/neuron/neuron_gradient_shap.py +++ b/captum/attr/_core/neuron/neuron_gradient_shap.py @@ -209,7 +209,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution score computed based on GradientSHAP with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py index 7b0ea844af..b9a5e80b7f 100644 --- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py +++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py @@ -134,8 +134,8 @@ def attribute( Support for multiple tensors will be added later. Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Deconvolution attribution of particular neuron with respect to each input feature. Attributions will always be the same size as the provided @@ -303,8 +303,8 @@ def attribute( Support for multiple tensors will be added later. Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Guided backprop attribution of particular neuron with respect to each input feature. 
Attributions will always be the same size as the provided diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py index ebf80a7241..2afc17180f 100644 --- a/captum/attr/_core/neuron/neuron_integrated_gradients.py +++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py @@ -202,8 +202,8 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Integrated gradients for particular neuron with respect to each input feature. Attributions will always be the same size as the provided diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py index 5fbb01378a..eda936a048 100644 --- a/captum/attr/_core/noise_tunnel.py +++ b/captum/attr/_core/noise_tunnel.py @@ -139,7 +139,7 @@ def attribute( Returns: **attributions** or 2-element tuple of **attributions**, **delta**: - - **attributions** (*tensor* or tuple of *tensors*): + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution with respect to each input feature. attributions will always be the same size as the provided inputs, with each value diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py index b7e03f5cb4..fedc2dae05 100644 --- a/captum/attr/_core/occlusion.py +++ b/captum/attr/_core/occlusion.py @@ -187,8 +187,8 @@ def attribute( # type: ignore Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The attributions with respect to each input feature. 
Attributions will always be the same size as the provided inputs, with each value diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py index 91a65e2733..505c35b28e 100644 --- a/captum/attr/_core/saliency.py +++ b/captum/attr/_core/saliency.py @@ -95,8 +95,8 @@ def attribute( Default: None Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The gradients with respect to each input feature. Attributions will always be the same size as the provided inputs, with each value diff --git a/captum/attr/_core/shapley_value.py b/captum/attr/_core/shapley_value.py index 32d3bf0861..4d5f244816 100644 --- a/captum/attr/_core/shapley_value.py +++ b/captum/attr/_core/shapley_value.py @@ -219,8 +219,8 @@ def attribute( Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The attributions with respect to each input feature. If the forward function returns a scalar value per example, attributions will be @@ -668,8 +668,8 @@ def attribute( a simple output of progress. Default: False Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): The attributions with respect to each input feature. 
If the forward function returns a scalar value per example, attributions will be diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py index 29158bcfdf..fed579eb92 100644 --- a/captum/attr/_utils/attribution.py +++ b/captum/attr/_utils/attribution.py @@ -56,8 +56,8 @@ def __init__(self, forward_func: Callable) -> None: Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution values for each input tensor. The `attributions` have the same shape and dimensionality as the inputs. @@ -110,8 +110,8 @@ def has_convergence_delta(self) -> bool: Returns: - *tensor* of **deltas**: - - **deltas** (*tensor*): + *Tensor* of **deltas**: + - **deltas** (*Tensor*): Depending on specific implementaion of sub-classes, convergence delta can be returned per sample in form of a tensor or it can be aggregated @@ -245,8 +245,8 @@ def compute_convergence_delta( Returns: - *tensor* of **deltas**: - - **deltas** (*tensor*): + *Tensor* of **deltas**: + - **deltas** (*Tensor*): This implementation returns convergence delta per sample. Deriving sub-classes may do any type of aggregation of those values, if necessary. @@ -408,8 +408,8 @@ def interpolate( attribution. Returns: - *tensor* of upsampled **attributions**: - - **attributions** (*tensor*): + *Tensor* of upsampled **attributions**: + - **attributions** (*Tensor*): Upsampled layer attributions with first 2 dimensions matching slayer_attribution and remaining dimensions given by interpolate_dims. @@ -470,8 +470,8 @@ def __init__( Returns: - *tensor* or tuple of *tensors* of **attributions**: - - **attributions** (*tensor* or tuple of *tensors*): + *Tensor* or tuple of *Tensor* of **attributions**: + - **attributions** (*Tensor* or tuple of *Tensor*): Attribution values for each input vector. 
The `attributions` have the dimensionality of inputs. diff --git a/captum/concept/_core/tcav.py b/captum/concept/_core/tcav.py index 73b3c758bb..64977901ef 100644 --- a/captum/concept/_core/tcav.py +++ b/captum/concept/_core/tcav.py @@ -465,7 +465,7 @@ def compute_cavs( Args: - experimental_sets (list of list of Concept): A list of lists of concept + experimental_sets (list[list[Concept]]): A list of lists of concept instances for which the cavs will be computed. force_train (bool, optional): A flag that indicates whether to train the CAVs regardless of whether they are saved or not. @@ -588,7 +588,7 @@ def interpret( dimension 0 corresponds to the number of examples (aka batch size), and if multiple input tensors are provided, the examples must be aligned appropriately. - experimental_sets (list of list of Concept): A list of list of Concept + experimental_sets (list[list[Concept]]): A list of list of Concept instances. target (int, tuple, Tensor, or list, optional): Output indices for which attributions are computed (for classification cases, diff --git a/captum/robust/_core/fgsm.py b/captum/robust/_core/fgsm.py index fe847331dc..0e42d08c37 100644 --- a/captum/robust/_core/fgsm.py +++ b/captum/robust/_core/fgsm.py @@ -132,7 +132,7 @@ def perturb( Returns: - - **perturbed inputs** (*tensor* or tuple of *tensors*): + - **perturbed inputs** (*Tensor* or tuple of *Tensor*): Perturbed input for each input tensor. The perturbed inputs have the same shape and dimensionality as the inputs. diff --git a/captum/robust/_core/perturbation.py b/captum/robust/_core/perturbation.py index 3edbe351e7..c47b02dd78 100644 --- a/captum/robust/_core/perturbation.py +++ b/captum/robust/_core/perturbation.py @@ -26,7 +26,7 @@ class Perturbation: Returns: - - **perturbed inputs** (*tensor* or tuple of *tensors*): + - **perturbed inputs** (*Tensor* or tuple of *Tensor*): Perturbed input for each input tensor. The perturbed inputs have the same shape and dimensionality as the inputs. 
diff --git a/captum/robust/_core/pgd.py b/captum/robust/_core/pgd.py index d342508919..733cbcc488 100644 --- a/captum/robust/_core/pgd.py +++ b/captum/robust/_core/pgd.py @@ -135,7 +135,7 @@ def perturb( Returns: - - **perturbed inputs** (*tensor* or tuple of *tensors*): + - **perturbed inputs** (*Tensor* or tuple of *Tensor*): Perturbed input for each input tensor. The perturbed inputs have the same shape and dimensionality as the inputs. From da0fab9e9d614bb4e3acc0cb011a7e072a69cf8f Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Fri, 16 Sep 2022 11:59:31 -0600 Subject: [PATCH 84/84] Fix newly introduced Mypy error --- tests/influence/_core/test_tracin_intermediate_quantities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/influence/_core/test_tracin_intermediate_quantities.py b/tests/influence/_core/test_tracin_intermediate_quantities.py index 7f3e806c28..9f0daebad3 100644 --- a/tests/influence/_core/test_tracin_intermediate_quantities.py +++ b/tests/influence/_core/test_tracin_intermediate_quantities.py @@ -179,7 +179,7 @@ def test_tracin_intermediate_quantities_consistent( else: # `test_features` is a tuple, so we unpack it to place in tuple, # along with `test_labels` - test_batch = (*test_features, test_labels) + test_batch = (*test_features, test_labels) # type: ignore[assignment] # the influence score is the dot product of intermediate quantities intermediate_quantities_scores = torch.matmul(