From e3cd8e9dd3508357e68f02c1b391634fe3678751 Mon Sep 17 00:00:00 2001
From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com>
Date: Wed, 8 Dec 2021 11:45:46 -0500
Subject: [PATCH] [*.ipynb] blacken

---
 docs/tutorials/_template.ipynb                |  589 ++---
 .../average_optimizers_callback.ipynb         |  787 +++---
 docs/tutorials/image_ops.ipynb                |  800 +++---
 docs/tutorials/layers_normalizations.ipynb    |  656 ++---
 .../layers_weightnormalization.ipynb          |  635 ++---
 docs/tutorials/losses_triplet.ipynb           |  661 ++---
 docs/tutorials/networks_seq2seq_nmt.ipynb     | 2266 +++++++++--------
 .../optimizers_conditionalgradient.ipynb      |  803 +++---
 .../optimizers_cyclicallearningrate.ipynb     |  907 +++----
 docs/tutorials/optimizers_lazyadam.ipynb      |  491 ++--
 docs/tutorials/time_stopping.ipynb            |  407 +--
 docs/tutorials/tqdm_progress_bar.ipynb        |  523 ++--
 12 files changed, 4854 insertions(+), 4671 deletions(-)
diff --git a/docs/tutorials/_template.ipynb b/docs/tutorials/_template.ipynb
index 07994386fd..f3b1de9c3c 100644
--- a/docs/tutorials/_template.ipynb
+++ b/docs/tutorials/_template.ipynb
@@ -1,296 +1,299 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Tce3stUlHN0L"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "tuOe1ymfHZPu"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "qFdPvlXBOdUN"
-      },
-      "source": [
-        "# Title"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MfBg1C5NB3X0"
-      },
-      "source": [
-        "<table class=\"tfa-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/_template\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/_template.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/_template.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "      <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/_template.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "r6P32iYYV27b"
-      },
-      "source": [
-        "[Update button links]"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "xHxb-dlhMIzW"
-      },
-      "source": [
-        "## Overview\n",
-        "\n",
-        "[Include a paragraph or two explaining what this example demonstrates, who should be interested in it, and what you need to know before you get started.]"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MUXex9ctTuDB"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "1Eh-iCRVBm0p"
-      },
-      "source": [
-        "[Put all your imports and installs up into a setup section.]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "rEk-ibQkDNtF"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "IqR2PQG4ZaZ0"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "UhNtHfuxCGVy"
-      },
-      "source": [
-        "## Resources\n",
-        "\n",
-        "* [TensorFlow documentation contributor guide](https://www.tensorflow.org/community/contribute/docs)\n",
-        "* [TensorFlow documentation style guide](https://www.tensorflow.org/community/contribute/docs_style)\n",
-        "* [Google developer documentation style guide](https://developers.google.com/style/highlights)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "2V22fKegUtF9"
-      },
-      "source": [
-        "## Notebook style\n",
-        "\n",
-        "* Include the collapsed license at the top (uses the Colab \"Form\" mode to hide the cells).\n",
-        "* Save the notebook with the table of contents open.\n",
-        "* Use one `H1` header for the title.\n",
-        "* Include the button-bar immediately after the `H1`.\n",
-        "* Include an overview section before any code.\n",
-        "* Put all your installs and imports in a setup section.\n",
-        "* Write Python 3 compatible code. You don't have to worry about Python 2 compatibility.\n",
-        "* Keep code and text cells as brief as possible.\n",
-        "* Avoid leaving an empty cell at the end of the notebook."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "QKp40qS-DGEZ"
-      },
-      "source": [
-        "### Code style\n",
-        "\n",
-        "* Notebooks are for people. Write code optimized for clarity.\n",
-        "* Keep examples quick. Use small datasets, or small slices of datasets. Don't train to convergence, train until it's obvious it's making progress.\n",
-        "* Demonstrate small parts before combining them into something more complex, like this:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "KtylpxOmceaC"
-      },
-      "outputs": [],
-      "source": [
-        "# Build the model\n",
-        "model = tf.keras.Sequential([\n",
-        "    tf.keras.layers.Dense(10, activation='relu', input_shape=(None, 5)),\n",
-        "    tf.keras.layers.Dense(3)\n",
-        "])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "pwdM2pl3RSPb"
-      },
-      "source": [
-        "Run the model on a single batch of data, and inspect the output:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "mMOeXVmbdilM"
-      },
-      "outputs": [],
-      "source": [
-        "import numpy as np\n",
-        "\n",
-        "result = model(tf.constant(np.random.randn(10,5), dtype = tf.float32)).numpy()\n",
-        "\n",
-        "print(\"min:\", result.min())\n",
-        "print(\"max:\", result.max())\n",
-        "print(\"mean:\", result.mean())\n",
-        "print(\"shape:\", result.shape)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "uabQmjMtRtzs"
-      },
-      "source": [
-        "Compile the model for training:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "U82B_tH2d294"
-      },
-      "outputs": [],
-      "source": [
-        "model.compile(optimizer=tf.keras.optimizers.Adam(),\n",
-        "              loss=tf.keras.losses.categorical_crossentropy)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "TJdqBNBbS78n"
-      },
-      "source": [
-        "### Code content\n",
-        "\n",
-        "* Use the highest level API that gets the job done (unless the goal is to demonstrate the low level API).\n",
-        "* Use `keras.Sequential` > keras functional api > keras model subclassing > ...\n",
-        "* Use `model.fit` > `model.train_on_batch` > manual `GradientTapes`.\n",
-        "* Use eager-style code.\n",
-        "* Use `tensorflow_datasets` and `tf.data` where possible.\n",
-        "* Avoid `compat.v1`."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "78HBT9cQXJko"
-      },
-      "source": [
-        "### Text\n",
-        "\n",
-        "* Use an imperative style. \"Run a batch of images through the model.\"\n",
-        "* Use sentence case in titles/headings. \n",
-        "* Use short titles/headings: \"Download the data\", \"Build the Model\", \"Train the model\".\n",
-        "* Use the [Google developer documentation style guide](https://developers.google.com/style/highlights).\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "YrsKXcPRUvK9"
-      },
-      "source": [
-        "## GitHub workflow\n",
-        "\n",
-        "* Be consistent about how you save your notebooks, otherwise the JSON diffs are messy.\n",
-        "* This notebook has the \"Omit code cell output when saving this notebook\" option set. GitHub refuses to diff notebooks with large diffs (inline images).\n",
-        "* [ReviewNB.com](http://reviewnb.com) can help with diffs. This is linked in a comment on a notebook pull request.\n",
-        "* Use the [Open in Colab](https://chrome.google.com/webstore/detail/open-in-colab/iogfkhleblhcpcekbiedikdehleodpjo) extension to open a GitHub notebook in Colab.\n",
-        "* The easiest way to edit a notebook in GitHub is to open it with Colab from the branch you want to edit. Then use File --> Save a copy in GitHub, which will save it back to the branch you opened it from.\n",
-        "* For PRs it's helpful to post a direct Colab link to the PR head: https://colab.research.google.com/github/{USER}/{REPO}/blob/{BRANCH}/{PATH}.ipynb"
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "collapsed_sections": [
-        "Tce3stUlHN0L"
-      ],
-      "name": "_template.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Tce3stUlHN0L"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors."
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "tuOe1ymfHZPu"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "qFdPvlXBOdUN"
+   },
+   "source": [
+    "# Title"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MfBg1C5NB3X0"
+   },
+   "source": [
+    "<table class=\"tfa-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/_template\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/_template.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/_template.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "      <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/_template.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "r6P32iYYV27b"
+   },
+   "source": [
+    "[Update button links]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xHxb-dlhMIzW"
+   },
+   "source": [
+    "## Overview\n",
+    "\n",
+    "[Include a paragraph or two explaining what this example demonstrates, who should be interested in it, and what you need to know before you get started.]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MUXex9ctTuDB"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1Eh-iCRVBm0p"
+   },
+   "source": [
+    "[Put all your imports and installs up into a setup section.]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "rEk-ibQkDNtF"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IqR2PQG4ZaZ0"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "UhNtHfuxCGVy"
+   },
+   "source": [
+    "## Resources\n",
+    "\n",
+    "* [TensorFlow documentation contributor guide](https://www.tensorflow.org/community/contribute/docs)\n",
+    "* [TensorFlow documentation style guide](https://www.tensorflow.org/community/contribute/docs_style)\n",
+    "* [Google developer documentation style guide](https://developers.google.com/style/highlights)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2V22fKegUtF9"
+   },
+   "source": [
+    "## Notebook style\n",
+    "\n",
+    "* Include the collapsed license at the top (uses the Colab \"Form\" mode to hide the cells).\n",
+    "* Save the notebook with the table of contents open.\n",
+    "* Use one `H1` header for the title.\n",
+    "* Include the button-bar immediately after the `H1`.\n",
+    "* Include an overview section before any code.\n",
+    "* Put all your installs and imports in a setup section.\n",
+    "* Write Python 3 compatible code. You don't have to worry about Python 2 compatibility.\n",
+    "* Keep code and text cells as brief as possible.\n",
+    "* Avoid leaving an empty cell at the end of the notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "QKp40qS-DGEZ"
+   },
+   "source": [
+    "### Code style\n",
+    "\n",
+    "* Notebooks are for people. Write code optimized for clarity.\n",
+    "* Keep examples quick. Use small datasets, or small slices of datasets. Don't train to convergence, train until it's obvious it's making progress.\n",
+    "* Demonstrate small parts before combining them into something more complex, like this:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "KtylpxOmceaC"
+   },
+   "outputs": [],
+   "source": [
+    "# Build the model\n",
+    "model = tf.keras.Sequential(\n",
+    "    [\n",
+    "        tf.keras.layers.Dense(10, activation=\"relu\", input_shape=(None, 5)),\n",
+    "        tf.keras.layers.Dense(3),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pwdM2pl3RSPb"
+   },
+   "source": [
+    "Run the model on a single batch of data, and inspect the output:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "mMOeXVmbdilM"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "result = model(tf.constant(np.random.randn(10, 5), dtype=tf.float32)).numpy()\n",
+    "\n",
+    "print(\"min:\", result.min())\n",
+    "print(\"max:\", result.max())\n",
+    "print(\"mean:\", result.mean())\n",
+    "print(\"shape:\", result.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uabQmjMtRtzs"
+   },
+   "source": [
+    "Compile the model for training:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "U82B_tH2d294"
+   },
+   "outputs": [],
+   "source": [
+    "model.compile(\n",
+    "    optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.categorical_crossentropy\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TJdqBNBbS78n"
+   },
+   "source": [
+    "### Code content\n",
+    "\n",
+    "* Use the highest level API that gets the job done (unless the goal is to demonstrate the low level API).\n",
+    "* Use `keras.Sequential` > Keras functional API > Keras model subclassing > ...\n",
+    "* Use `model.fit` > `model.train_on_batch` > manual `GradientTapes`.\n",
+    "* Use eager-style code.\n",
+    "* Use `tensorflow_datasets` and `tf.data` where possible.\n",
+    "* Avoid `compat.v1`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "78HBT9cQXJko"
+   },
+   "source": [
+    "### Text\n",
+    "\n",
+    "* Use an imperative style. \"Run a batch of images through the model.\"\n",
+    "* Use sentence case in titles/headings.\n",
+    "* Use short titles/headings: \"Download the data\", \"Build the model\", \"Train the model\".\n",
+    "* Use the [Google developer documentation style guide](https://developers.google.com/style/highlights).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "YrsKXcPRUvK9"
+   },
+   "source": [
+    "## GitHub workflow\n",
+    "\n",
+    "* Be consistent about how you save your notebooks, otherwise the JSON diffs are messy.\n",
+    "* This notebook has the \"Omit code cell output when saving this notebook\" option set. GitHub refuses to diff notebooks with large diffs (inline images).\n",
+    "* [ReviewNB.com](http://reviewnb.com) can help with diffs. This is linked in a comment on a notebook pull request.\n",
+    "* Use the [Open in Colab](https://chrome.google.com/webstore/detail/open-in-colab/iogfkhleblhcpcekbiedikdehleodpjo) extension to open a GitHub notebook in Colab.\n",
+    "* The easiest way to edit a notebook in GitHub is to open it with Colab from the branch you want to edit. Then use File --> Save a copy in GitHub, which will save it back to the branch you opened it from.\n",
+    "* For PRs it's helpful to post a direct Colab link to the PR head: https://colab.research.google.com/github/{USER}/{REPO}/blob/{BRANCH}/{PATH}.ipynb"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [
+    "Tce3stUlHN0L"
+   ],
+   "name": "_template.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/average_optimizers_callback.ipynb b/docs/tutorials/average_optimizers_callback.ipynb
index ecf0ca4646..5cef82688e 100644
--- a/docs/tutorials/average_optimizers_callback.ipynb
+++ b/docs/tutorials/average_optimizers_callback.ipynb
@@ -1,395 +1,398 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Tce3stUlHN0L"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "tuOe1ymfHZPu"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MfBg1C5NB3X0"
-      },
-      "source": [
-        "# Model Averaging\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/average_optimizers_callback\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/average_optimizers_callback.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/average_optimizers_callback.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "      <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/average_optimizers_callback.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "xHxb-dlhMIzW"
-      },
-      "source": [
-        "## Overview\n",
-        "\n",
-        "This notebook demonstrates how to use Moving Average Optimizer along with the Model Average Checkpoint from tensorflow addons package.\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "o2UNySlpXkbl"
-      },
-      "source": [
-        "## Moving Averaging \n",
-        "\n",
-        "> The advantage of Moving Averaging is that they are less prone to rampant loss shifts or irregular data representation in the latest batch. It gives a smooothened and a more genral idea of the model training until some point.\n",
-        "\n",
-        "## Stochastic Averaging\n",
-        "\n",
-        "> Stochastic Weight Averaging converges to wider optima. By doing so, it resembles geometric ensembeling. SWA is a simple method to improve model performance when used as a wrapper around other optimizers and averaging results from different points of trajectory of the inner optimizer.\n",
-        "\n",
-        "## Model Average Checkpoint \n",
-        "\n",
-        "> `callbacks.ModelCheckpoint` doesn't give you the option to save moving average weights in the middle of training, which is why Model Average Optimizers required a custom callback. Using the ```update_weights``` parameter, ```ModelAverageCheckpoint``` allows you to:\n",
-        "1.   Assign the moving average weights to the model, and save them.\n",
-        "2.   Keep the old non-averaged weights, but the saved model uses the average weights."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MUXex9ctTuDB"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "sXEOqj5cIgyW"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "IqR2PQG4ZaZ0"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "4hnJ2rDpI38-"
-      },
-      "outputs": [],
-      "source": [
-        "import numpy as np\n",
-        "import os"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Iox_HZNNYLEB"
-      },
-      "source": [
-        "## Build Model "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "KtylpxOmceaC"
-      },
-      "outputs": [],
-      "source": [
-        "def create_model(opt):\n",
-        "    model = tf.keras.models.Sequential([\n",
-        "        tf.keras.layers.Flatten(),                         \n",
-        "        tf.keras.layers.Dense(64, activation='relu'),\n",
-        "        tf.keras.layers.Dense(64, activation='relu'),\n",
-        "        tf.keras.layers.Dense(10, activation='softmax')\n",
-        "    ])\n",
-        "\n",
-        "    model.compile(optimizer=opt,\n",
-        "                    loss='sparse_categorical_crossentropy',\n",
-        "                    metrics=['accuracy'])\n",
-        "\n",
-        "    return model"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "pwdM2pl3RSPb"
-      },
-      "source": [
-        "## Prepare Dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "mMOeXVmbdilM"
-      },
-      "outputs": [],
-      "source": [
-        "#Load Fashion MNIST dataset\n",
-        "train, test = tf.keras.datasets.fashion_mnist.load_data()\n",
-        "\n",
-        "images, labels = train\n",
-        "images = images/255.0\n",
-        "labels = labels.astype(np.int32)\n",
-        "\n",
-        "fmnist_train_ds = tf.data.Dataset.from_tensor_slices((images, labels))\n",
-        "fmnist_train_ds = fmnist_train_ds.shuffle(5000).batch(32)\n",
-        "\n",
-        "test_images, test_labels = test"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "iEbhI_eajpJe"
-      },
-      "source": [
-        "We will be comparing three optimizers here:\n",
-        "\n",
-        "*   Unwrapped SGD\n",
-        "*   SGD with Moving Average\n",
-        "*   SGD with Stochastic Weight Averaging\n",
-        "\n",
-        "And see how they perform with the same model."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "_Q76K1fNk7Va"
-      },
-      "outputs": [],
-      "source": [
-        "#Optimizers \n",
-        "sgd = tf.keras.optimizers.SGD(0.01)\n",
-        "moving_avg_sgd = tfa.optimizers.MovingAverage(sgd)\n",
-        "stocastic_avg_sgd = tfa.optimizers.SWA(sgd)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "nXlMX4p9qHwg"
-      },
-      "source": [
-        "Both ```MovingAverage``` and ```StocasticAverage``` optimers use ```ModelAverageCheckpoint```."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "SnvZjt34qEHY"
-      },
-      "outputs": [],
-      "source": [
-        "#Callback \n",
-        "checkpoint_path = \"./training/cp-{epoch:04d}.ckpt\"\n",
-        "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
-        "\n",
-        "cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_dir,\n",
-        "                                                 save_weights_only=True,\n",
-        "                                                 verbose=1)\n",
-        "avg_callback = tfa.callbacks.AverageModelCheckpoint(filepath=checkpoint_dir, \n",
-        "                                                    update_weights=True)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "uabQmjMtRtzs"
-      },
-      "source": [
-        "## Train Model\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "SPmifETHmPix"
-      },
-      "source": [
-        "### Vanilla SGD Optimizer "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Xy8W4LYppadJ"
-      },
-      "outputs": [],
-      "source": [
-        "#Build Model\n",
-        "model = create_model(sgd)\n",
-        "\n",
-        "#Train the network\n",
-        "model.fit(fmnist_train_ds, epochs=5, callbacks=[cp_callback])"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "uU2iQ6HAZ6-E"
-      },
-      "outputs": [],
-      "source": [
-        "#Evalute results\n",
-        "model.load_weights(checkpoint_dir)\n",
-        "loss, accuracy = model.evaluate(test_images, test_labels, batch_size=32, verbose=2)\n",
-        "print(\"Loss :\", loss)\n",
-        "print(\"Accuracy :\", accuracy)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "lAvhD4unmc6W"
-      },
-      "source": [
-        "### Moving Average SGD"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "--NIjBp-mhVb"
-      },
-      "outputs": [],
-      "source": [
-        "#Build Model\n",
-        "model = create_model(moving_avg_sgd)\n",
-        "\n",
-        "#Train the network\n",
-        "model.fit(fmnist_train_ds, epochs=5, callbacks=[avg_callback])"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "zRAym9EBmnW9"
-      },
-      "outputs": [],
-      "source": [
-        "#Evalute results\n",
-        "model.load_weights(checkpoint_dir)\n",
-        "loss, accuracy = model.evaluate(test_images, test_labels, batch_size=32, verbose=2)\n",
-        "print(\"Loss :\", loss)\n",
-        "print(\"Accuracy :\", accuracy)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "K98lbU07m_Bk"
-      },
-      "source": [
-        "### Stocastic Weight Average SGD "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Ia7ALKefnXWQ"
-      },
-      "outputs": [],
-      "source": [
-        "#Build Model\n",
-        "model = create_model(stocastic_avg_sgd)\n",
-        "\n",
-        "#Train the network\n",
-        "model.fit(fmnist_train_ds, epochs=5, callbacks=[avg_callback])"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "EOT2E9NBoeHI"
-      },
-      "outputs": [],
-      "source": [
-        "#Evalute results\n",
-        "model.load_weights(checkpoint_dir)\n",
-        "loss, accuracy = model.evaluate(test_images, test_labels, batch_size=32, verbose=2)\n",
-        "print(\"Loss :\", loss)\n",
-        "print(\"Accuracy :\", accuracy)"
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "collapsed_sections": [
-        "Tce3stUlHN0L"
-      ],
-      "name": "average_optimizers_callback.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Tce3stUlHN0L"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors."
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "tuOe1ymfHZPu"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MfBg1C5NB3X0"
+   },
+   "source": [
+    "# Model Averaging\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/average_optimizers_callback\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/average_optimizers_callback.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/average_optimizers_callback.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "      <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/average_optimizers_callback.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xHxb-dlhMIzW"
+   },
+   "source": [
+    "## Overview\n",
+    "\n",
+    "This notebook demonstrates how to use the Moving Average optimizer along with the Model Average Checkpoint callback from the TensorFlow Addons package.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "o2UNySlpXkbl"
+   },
+   "source": [
+    "## Moving Averaging \n",
+    "\n",
+    "> The advantage of Moving Averaging is that it is less prone to rampant loss shifts or irregular data representation in the latest batch. It gives a smoothed and more general idea of the model training up to some point.\n",
+    "\n",
+    "## Stochastic Averaging\n",
+    "\n",
+    "> Stochastic Weight Averaging converges to wider optima. By doing so, it resembles geometric ensembling. SWA is a simple method to improve model performance: it is used as a wrapper around another optimizer and averages results from different points along the trajectory of the inner optimizer.\n",
+    "\n",
+    "## Model Average Checkpoint \n",
+    "\n",
+    "> `callbacks.ModelCheckpoint` doesn't give you the option to save moving average weights in the middle of training, which is why Model Average Optimizers require a custom callback. Using the ```update_weights``` parameter, ```AverageModelCheckpoint``` allows you to:\n",
+    "1.   Assign the moving average weights to the model, and save them.\n",
+    "2.   Keep the old non-averaged weights, but the saved model uses the average weights."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MUXex9ctTuDB"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "sXEOqj5cIgyW"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IqR2PQG4ZaZ0"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "4hnJ2rDpI38-"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Iox_HZNNYLEB"
+   },
+   "source": [
+    "## Build Model "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "KtylpxOmceaC"
+   },
+   "outputs": [],
+   "source": [
+    "def create_model(opt):\n",
+    "    model = tf.keras.models.Sequential(\n",
+    "        [\n",
+    "            tf.keras.layers.Flatten(),\n",
+    "            tf.keras.layers.Dense(64, activation=\"relu\"),\n",
+    "            tf.keras.layers.Dense(64, activation=\"relu\"),\n",
+    "            tf.keras.layers.Dense(10, activation=\"softmax\"),\n",
+    "        ]\n",
+    "    )\n",
+    "\n",
+    "    model.compile(\n",
+    "        optimizer=opt, loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"]\n",
+    "    )\n",
+    "\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pwdM2pl3RSPb"
+   },
+   "source": [
+    "## Prepare Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "mMOeXVmbdilM"
+   },
+   "outputs": [],
+   "source": [
+    "# Load Fashion MNIST dataset\n",
+    "train, test = tf.keras.datasets.fashion_mnist.load_data()\n",
+    "\n",
+    "images, labels = train\n",
+    "images = images / 255.0\n",
+    "labels = labels.astype(np.int32)\n",
+    "\n",
+    "fmnist_train_ds = tf.data.Dataset.from_tensor_slices((images, labels))\n",
+    "fmnist_train_ds = fmnist_train_ds.shuffle(5000).batch(32)\n",
+    "\n",
+    "test_images, test_labels = test"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "iEbhI_eajpJe"
+   },
+   "source": [
+    "We will be comparing three optimizers here:\n",
+    "\n",
+    "*   Unwrapped SGD\n",
+    "*   SGD with Moving Average\n",
+    "*   SGD with Stochastic Weight Averaging\n",
+    "\n",
+    "And see how they perform with the same model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "_Q76K1fNk7Va"
+   },
+   "outputs": [],
+   "source": [
+    "# Optimizers\n",
+    "sgd = tf.keras.optimizers.SGD(0.01)\n",
+    "moving_avg_sgd = tfa.optimizers.MovingAverage(sgd)\n",
+    "stocastic_avg_sgd = tfa.optimizers.SWA(sgd)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "nXlMX4p9qHwg"
+   },
+   "source": [
+    "Both ```MovingAverage``` and ```SWA``` optimizers use ```AverageModelCheckpoint```."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SnvZjt34qEHY"
+   },
+   "outputs": [],
+   "source": [
+    "# Callback\n",
+    "checkpoint_path = \"./training/cp-{epoch:04d}.ckpt\"\n",
+    "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
+    "\n",
+    "cp_callback = tf.keras.callbacks.ModelCheckpoint(\n",
+    "    filepath=checkpoint_dir, save_weights_only=True, verbose=1\n",
+    ")\n",
+    "avg_callback = tfa.callbacks.AverageModelCheckpoint(\n",
+    "    filepath=checkpoint_dir, update_weights=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uabQmjMtRtzs"
+   },
+   "source": [
+    "## Train Model\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "SPmifETHmPix"
+   },
+   "source": [
+    "### Vanilla SGD Optimizer "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Xy8W4LYppadJ"
+   },
+   "outputs": [],
+   "source": [
+    "# Build Model\n",
+    "model = create_model(sgd)\n",
+    "\n",
+    "# Train the network\n",
+    "model.fit(fmnist_train_ds, epochs=5, callbacks=[cp_callback])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "uU2iQ6HAZ6-E"
+   },
+   "outputs": [],
+   "source": [
+    "# Evaluate results\n",
+    "model.load_weights(checkpoint_dir)\n",
+    "loss, accuracy = model.evaluate(test_images, test_labels, batch_size=32, verbose=2)\n",
+    "print(\"Loss :\", loss)\n",
+    "print(\"Accuracy :\", accuracy)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "lAvhD4unmc6W"
+   },
+   "source": [
+    "### Moving Average SGD"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "--NIjBp-mhVb"
+   },
+   "outputs": [],
+   "source": [
+    "# Build Model\n",
+    "model = create_model(moving_avg_sgd)\n",
+    "\n",
+    "# Train the network\n",
+    "model.fit(fmnist_train_ds, epochs=5, callbacks=[avg_callback])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zRAym9EBmnW9"
+   },
+   "outputs": [],
+   "source": [
+    "# Evaluate results\n",
+    "model.load_weights(checkpoint_dir)\n",
+    "loss, accuracy = model.evaluate(test_images, test_labels, batch_size=32, verbose=2)\n",
+    "print(\"Loss :\", loss)\n",
+    "print(\"Accuracy :\", accuracy)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "K98lbU07m_Bk"
+   },
+   "source": [
+    "### Stochastic Weight Average SGD "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Ia7ALKefnXWQ"
+   },
+   "outputs": [],
+   "source": [
+    "# Build Model\n",
+    "model = create_model(stocastic_avg_sgd)\n",
+    "\n",
+    "# Train the network\n",
+    "model.fit(fmnist_train_ds, epochs=5, callbacks=[avg_callback])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "EOT2E9NBoeHI"
+   },
+   "outputs": [],
+   "source": [
+    "# Evaluate results\n",
+    "model.load_weights(checkpoint_dir)\n",
+    "loss, accuracy = model.evaluate(test_images, test_labels, batch_size=32, verbose=2)\n",
+    "print(\"Loss :\", loss)\n",
+    "print(\"Accuracy :\", accuracy)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [
+    "Tce3stUlHN0L"
+   ],
+   "name": "average_optimizers_callback.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/image_ops.ipynb b/docs/tutorials/image_ops.ipynb
index d43a5bccf9..6062fb0995 100644
--- a/docs/tutorials/image_ops.ipynb
+++ b/docs/tutorials/image_ops.ipynb
@@ -1,401 +1,405 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "GWEKvPCCxJke"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "l-m8KQ-nxK5l"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "O8FuVCLYxi_l"
-      },
-      "source": [
-        "# TensorFlow Addons Image: Operations\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/image_ops\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/image_ops.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/image_ops.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "      <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/image_ops.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "2a5ksOt-xsOl"
-      },
-      "source": [
-        "## Overview\n",
-        "This notebook will demonstrate how to use the some image operations in TensorFlow Addons.\n",
-        "\n",
-        "Here is the list of image operations you'll be covering in this example:\n",
-        "\n",
-        "- `tfa.image.mean_filter2d`\n",
-        "\n",
-        "- `tfa.image.rotate`\n",
-        "\n",
-        "- `tfa.image.transform`\n",
-        "\n",
-        "- `tfa.image.random_hsv_in_yiq`\n",
-        "\n",
-        "- `tfa.image.adjust_hsv_in_yiq`\n",
-        "\n",
-        "- `tfa.image.dense_image_warp`\n",
-        "\n",
-        "- `tfa.image.euclidean_dist_transform`"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "DMbjxr4PyMPF"
-      },
-      "source": [
-        "# Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "o_QTX_vHGbj7"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "5hVIKCrhWh4a"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import numpy as np\n",
-        "import tensorflow_addons as tfa\n",
-        "import matplotlib.pyplot as plt"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Q6Z2rsP8yp2v"
-      },
-      "source": [
-        "# Prepare and Inspect Images"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "9gbgJP10z9WO"
-      },
-      "source": [
-        "## Download the images"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "IgUsVhBQ6dSg"
-      },
-      "outputs": [],
-      "source": [
-        "img_path = tf.keras.utils.get_file('tensorflow.png','https://tensorflow.org/images/tf_logo.png')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "uheQOL-y0Fj3"
-      },
-      "source": [
-        "## Inspect the images"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MFGirRRZ0Y9k"
-      },
-      "source": [
-        "### TensorFlow Icon"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "NRlvNQdm1YI8"
-      },
-      "outputs": [],
-      "source": [
-        "img_raw = tf.io.read_file(img_path)\n",
-        "img = tf.io.decode_image(img_raw)\n",
-        "img = tf.image.convert_image_dtype(img, tf.float32)\n",
-        "img = tf.image.resize(img, [500,500])\n",
-        "\n",
-        "plt.title(\"TensorFlow Logo with shape {}\".format(img.shape))\n",
-        "_ = plt.imshow(img)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "clXQrFVa2nN7"
-      },
-      "source": [
-        "### Make a black and white version"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "tbaIkUCS2eNv"
-      },
-      "outputs": [],
-      "source": [
-        "bw_img = 1.0 - tf.image.rgb_to_grayscale(img)\n",
-        "\n",
-        "plt.title(\"Mask image with shape {}\".format(bw_img.shape))\n",
-        "_ = plt.imshow(bw_img[...,0], cmap='gray')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "UwqfpOm--vV2"
-      },
-      "source": [
-        "# Play with tfa.image"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "jIa5HnomPds3"
-      },
-      "source": [
-        "## Mean filtering\n",
-        "Mean filtering is a filtering technique, which is often used to remove noise from an image or signal. The idea is to run through the image pixel by pixel and replacing it with the average values of neighboring pixels."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "SutWnbRoHl6i"
-      },
-      "outputs": [],
-      "source": [
-        "mean = tfa.image.mean_filter2d(img, filter_shape=11)\n",
-        "_ = plt.imshow(mean)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Mp6cU7I0-r2h"
-      },
-      "source": [
-        "## Rotate\n",
-        "This operation rotates the given image by the angle (in radians) input by the user.  "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "9kxUES9sM8Jl"
-      },
-      "outputs": [],
-      "source": [
-        "rotate = tfa.image.rotate(img, tf.constant(np.pi/8))\n",
-        "_ = plt.imshow(rotate)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "WjMdSDKlBcPh"
-      },
-      "source": [
-        "## Transform\n",
-        "This operation transforms the given image on the basis of the transform vector given by the user. "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "HTh1Qpps8Rg5"
-      },
-      "outputs": [],
-      "source": [
-        "transform = tfa.image.transform(img, [1.0, 1.0, -250, 0.0, 1.0, 0.0, 0.0, 0.0])\n",
-        "_ = plt.imshow(transform)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "O79BrK-bC8oh"
-      },
-      "source": [
-        "## Random HSV in YIQ\n",
-        "This operation changes color scale of a given RGB image to YIQ but here delta hue and saturation values are picked randomly from the given range."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "zZBI-9XvBSuh"
-      },
-      "outputs": [],
-      "source": [
-        "delta = 0.5\n",
-        "lower_saturation = 0.1\n",
-        "upper_saturation = 0.9\n",
-        "lower_value = 0.2\n",
-        "upper_value = 0.8\n",
-        "rand_hsvinyiq = tfa.image.random_hsv_in_yiq(img, delta, lower_saturation, upper_saturation, lower_value, upper_value)\n",
-        "_ = plt.imshow(rand_hsvinyiq)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ruyvVnmCDBgj"
-      },
-      "source": [
-        "## Adjust HSV in YIQ\n",
-        "This operation changes color scale of a given RGB image to YIQ but here instead of choosing randomly, delta hue and saturation values are inputs form the user."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "vbCdwGtYChnQ"
-      },
-      "outputs": [],
-      "source": [
-        "delta = 0.5\n",
-        "saturation = 0.3\n",
-        "value = 0.6\n",
-        "adj_hsvinyiq = tfa.image.adjust_hsv_in_yiq(img, delta, saturation, value)\n",
-        "_ = plt.imshow(adj_hsvinyiq)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "fdbCDYJkG8Gv"
-      },
-      "source": [
-        "## Dense Image Warp\n",
-        "This operation is for non-linear warp of any image specified by the flow field of the offset vector (here used random values for example). "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "dG557eQDDtSK"
-      },
-      "outputs": [],
-      "source": [
-        "input_img = tf.image.convert_image_dtype(tf.expand_dims(img, 0), tf.dtypes.float32)\n",
-        "\n",
-        "flow_shape = [1, input_img.shape[1], input_img.shape[2], 2]\n",
-        "init_flows = np.float32(np.random.normal(size=flow_shape) * 2.0)\n",
-        "dense_img_warp = tfa.image.dense_image_warp(input_img, init_flows)\n",
-        "dense_img_warp = tf.squeeze(dense_img_warp, 0)\n",
-        "_ = plt.imshow(dense_img_warp)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "FcLMnSKYPcjA"
-      },
-      "source": [
-        "## Euclidian Distance Transform\n",
-        "This operation updates the pixel value with the euclidian distance from the foreground pixel to the background one.\n",
-        "* Note : It takes only binary image and results in transformed image. If a different image is given it results in a image with single value"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "-OMh6oeRQaYQ"
-      },
-      "outputs": [],
-      "source": [
-        "gray = tf.image.convert_image_dtype(bw_img,tf.uint8)\n",
-        "# The op expects a batch of images, so add a batch dimension\n",
-        "gray = tf.expand_dims(gray, 0)\n",
-        "eucid = tfa.image.euclidean_dist_transform(gray)\n",
-        "eucid = tf.squeeze(eucid, (0, -1))\n",
-        "_ = plt.imshow(eucid, cmap='gray')"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "image_ops.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "GWEKvPCCxJke"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors."
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "l-m8KQ-nxK5l"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "O8FuVCLYxi_l"
+   },
+   "source": [
+    "# TensorFlow Addons Image: Operations\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/image_ops\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/image_ops.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/image_ops.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "      <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/image_ops.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2a5ksOt-xsOl"
+   },
+   "source": [
+    "## Overview\n",
+    "This notebook will demonstrate how to use some image operations in TensorFlow Addons.\n",
+    "\n",
+    "Here is the list of image operations you'll be covering in this example:\n",
+    "\n",
+    "- `tfa.image.mean_filter2d`\n",
+    "\n",
+    "- `tfa.image.rotate`\n",
+    "\n",
+    "- `tfa.image.transform`\n",
+    "\n",
+    "- `tfa.image.random_hsv_in_yiq`\n",
+    "\n",
+    "- `tfa.image.adjust_hsv_in_yiq`\n",
+    "\n",
+    "- `tfa.image.dense_image_warp`\n",
+    "\n",
+    "- `tfa.image.euclidean_dist_transform`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DMbjxr4PyMPF"
+   },
+   "source": [
+    "# Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "o_QTX_vHGbj7"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "5hVIKCrhWh4a"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import numpy as np\n",
+    "import tensorflow_addons as tfa\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Q6Z2rsP8yp2v"
+   },
+   "source": [
+    "# Prepare and Inspect Images"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "9gbgJP10z9WO"
+   },
+   "source": [
+    "## Download the images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IgUsVhBQ6dSg"
+   },
+   "outputs": [],
+   "source": [
+    "img_path = tf.keras.utils.get_file(\n",
+    "    \"tensorflow.png\", \"https://tensorflow.org/images/tf_logo.png\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uheQOL-y0Fj3"
+   },
+   "source": [
+    "## Inspect the images"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MFGirRRZ0Y9k"
+   },
+   "source": [
+    "### TensorFlow Icon"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NRlvNQdm1YI8"
+   },
+   "outputs": [],
+   "source": [
+    "img_raw = tf.io.read_file(img_path)\n",
+    "img = tf.io.decode_image(img_raw)\n",
+    "img = tf.image.convert_image_dtype(img, tf.float32)\n",
+    "img = tf.image.resize(img, [500, 500])\n",
+    "\n",
+    "plt.title(\"TensorFlow Logo with shape {}\".format(img.shape))\n",
+    "_ = plt.imshow(img)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "clXQrFVa2nN7"
+   },
+   "source": [
+    "### Make a black and white version"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "tbaIkUCS2eNv"
+   },
+   "outputs": [],
+   "source": [
+    "bw_img = 1.0 - tf.image.rgb_to_grayscale(img)\n",
+    "\n",
+    "plt.title(\"Mask image with shape {}\".format(bw_img.shape))\n",
+    "_ = plt.imshow(bw_img[..., 0], cmap=\"gray\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "UwqfpOm--vV2"
+   },
+   "source": [
+    "# Play with tfa.image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "jIa5HnomPds3"
+   },
+   "source": [
+    "## Mean filtering\n",
+    "Mean filtering is a filtering technique, which is often used to remove noise from an image or signal. The idea is to run through the image pixel by pixel, replacing each pixel with the average value of its neighboring pixels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SutWnbRoHl6i"
+   },
+   "outputs": [],
+   "source": [
+    "mean = tfa.image.mean_filter2d(img, filter_shape=11)\n",
+    "_ = plt.imshow(mean)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Mp6cU7I0-r2h"
+   },
+   "source": [
+    "## Rotate\n",
+    "This operation rotates the given image by the angle (in radians) input by the user.  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "9kxUES9sM8Jl"
+   },
+   "outputs": [],
+   "source": [
+    "rotate = tfa.image.rotate(img, tf.constant(np.pi / 8))\n",
+    "_ = plt.imshow(rotate)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "WjMdSDKlBcPh"
+   },
+   "source": [
+    "## Transform\n",
+    "This operation transforms the given image on the basis of the transform vector given by the user. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "HTh1Qpps8Rg5"
+   },
+   "outputs": [],
+   "source": [
+    "transform = tfa.image.transform(img, [1.0, 1.0, -250, 0.0, 1.0, 0.0, 0.0, 0.0])\n",
+    "_ = plt.imshow(transform)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "O79BrK-bC8oh"
+   },
+   "source": [
+    "## Random HSV in YIQ\n",
+    "This operation changes color scale of a given RGB image to YIQ but here delta hue and saturation values are picked randomly from the given range."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zZBI-9XvBSuh"
+   },
+   "outputs": [],
+   "source": [
+    "delta = 0.5\n",
+    "lower_saturation = 0.1\n",
+    "upper_saturation = 0.9\n",
+    "lower_value = 0.2\n",
+    "upper_value = 0.8\n",
+    "rand_hsvinyiq = tfa.image.random_hsv_in_yiq(\n",
+    "    img, delta, lower_saturation, upper_saturation, lower_value, upper_value\n",
+    ")\n",
+    "_ = plt.imshow(rand_hsvinyiq)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ruyvVnmCDBgj"
+   },
+   "source": [
+    "## Adjust HSV in YIQ\n",
+    "This operation changes the color scale of a given RGB image to YIQ, but here, instead of being chosen randomly, the delta hue and saturation values are inputs from the user."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vbCdwGtYChnQ"
+   },
+   "outputs": [],
+   "source": [
+    "delta = 0.5\n",
+    "saturation = 0.3\n",
+    "value = 0.6\n",
+    "adj_hsvinyiq = tfa.image.adjust_hsv_in_yiq(img, delta, saturation, value)\n",
+    "_ = plt.imshow(adj_hsvinyiq)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "fdbCDYJkG8Gv"
+   },
+   "source": [
+    "## Dense Image Warp\n",
+    "This operation performs a non-linear warp of any image, as specified by the flow field of the offset vector (random values are used here as an example). "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "dG557eQDDtSK"
+   },
+   "outputs": [],
+   "source": [
+    "input_img = tf.image.convert_image_dtype(tf.expand_dims(img, 0), tf.dtypes.float32)\n",
+    "\n",
+    "flow_shape = [1, input_img.shape[1], input_img.shape[2], 2]\n",
+    "init_flows = np.float32(np.random.normal(size=flow_shape) * 2.0)\n",
+    "dense_img_warp = tfa.image.dense_image_warp(input_img, init_flows)\n",
+    "dense_img_warp = tf.squeeze(dense_img_warp, 0)\n",
+    "_ = plt.imshow(dense_img_warp)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "FcLMnSKYPcjA"
+   },
+   "source": [
+    "## Euclidean Distance Transform\n",
+    "This operation updates the pixel value with the Euclidean distance from the foreground pixel to the background one.\n",
+    "* Note: It takes only a binary image and results in a transformed image. If a different image is given, it results in an image with a single value."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "-OMh6oeRQaYQ"
+   },
+   "outputs": [],
+   "source": [
+    "gray = tf.image.convert_image_dtype(bw_img, tf.uint8)\n",
+    "# The op expects a batch of images, so add a batch dimension\n",
+    "gray = tf.expand_dims(gray, 0)\n",
+    "eucid = tfa.image.euclidean_dist_transform(gray)\n",
+    "eucid = tf.squeeze(eucid, (0, -1))\n",
+    "_ = plt.imshow(eucid, cmap=\"gray\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "image_ops.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/layers_normalizations.ipynb b/docs/tutorials/layers_normalizations.ipynb
index 0788784667..3e984c3fe5 100644
--- a/docs/tutorials/layers_normalizations.ipynb
+++ b/docs/tutorials/layers_normalizations.ipynb
@@ -1,324 +1,338 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "wFPyjGqMQ82Q"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "aNZ7aEDyQIYU"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "uMOmzhPEQh7b"
-      },
-      "source": [
-        "# Normalizations\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/layers_normalizations\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/layers_normalizations.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/layers_normalizations.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/layers_normalizations.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "cthm5dovQMJl"
-      },
-      "source": [
-        "## Overview\n",
-        "This notebook gives a brief introduction into the [normalization layers](https://github.com/tensorflow/addons/blob/master/tensorflow_addons/layers/normalizations.py) of TensorFlow. Currently supported layers are:\n",
-        "* **Group Normalization** (TensorFlow Addons)\n",
-        "* **Instance Normalization** (TensorFlow Addons)\n",
-        "* **Layer Normalization** (TensorFlow Core)\n",
-        "\n",
-        "The basic idea behind these layers is to normalize the output of an activation layer to improve the convergence during training. In contrast to [batch normalization](https://keras.io/layers/normalization/) these normalizations do not work on batches, instead they normalize the activations of a single sample, making them suitable for recurrent neural networks as well. \n",
-        "\n",
-        "Typically the normalization is performed by calculating the mean and the standard deviation of a subgroup in your input tensor. It is also possible to apply a scale and an offset factor to this as well.\n",
-        "\n",
-        "\n",
-        "$y_{i} = \\frac{\\gamma ( x_{i} - \\mu )}{\\sigma }+ \\beta$\n",
-        "\n",
-        "$ y$ : Output\n",
-        "\n",
-        "$x$ : Input\n",
-        "\n",
-        "$\\gamma$ : Scale factor\n",
-        "\n",
-        "$\\mu$: mean\n",
-        "\n",
-        "$\\sigma$: standard deviation\n",
-        "\n",
-        "$\\beta$: Offset factor\n",
-        "\n",
-        "\n",
-        "The following image demonstrates the difference between these techniques. Each subplot shows an input tensor, with N as the batch axis, C as the channel axis, and (H, W)\n",
-        "as the spatial axes (Height and Width of a picture for example). The pixels in blue are normalized by the same mean and variance, computed by aggregating the values of these pixels.\n",
-        "\n",
-        "![](https://github.com/shaohua0116/Group-Normalization-Tensorflow/raw/master/figure/gn.png)\n",
-        "\n",
-        "Source: (https://arxiv.org/pdf/1803.08494.pdf)\n",
-        "\n",
-        "The weights gamma and beta are trainable in all normalization layers to compensate for the possible lost of representational ability. You can activate these factors by setting the `center` or the `scale` flag to `True`. Of course you can use `initializers`, `constraints` and `regularizer` for `beta` and `gamma` to tune these values during the training process. "
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "I2XlcXf5WBHb"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "kTlbneoEUKrD"
-      },
-      "source": [
-        "### Install Tensorflow 2.0 and Tensorflow-Addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "_ZQGY_ALnirQ"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "7aGgPZG_WBHg"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "u82Gz_gOUPDZ"
-      },
-      "source": [
-        "### Preparing Dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "3wso9oidUZZQ"
-      },
-      "outputs": [],
-      "source": [
-        "mnist = tf.keras.datasets.mnist\n",
-        "\n",
-        "(x_train, y_train),(x_test, y_test) = mnist.load_data()\n",
-        "x_train, x_test = x_train / 255.0, x_test / 255.0"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "UTQH56j89POZ"
-      },
-      "source": [
-        "## Group Normalization Tutorial \n",
-        "\n",
-        "### Introduction\n",
-        "Group Normalization(GN) divides the channels of your inputs into smaller sub groups and normalizes these values based on their mean and variance. Since GN works on a single example this technique is batchsize independent. \n",
-        "\n",
-        "GN experimentally scored closed to batch normalization in image classification tasks. It can be beneficial to use GN instead of Batch Normalization in case your overall batch_size is low, which would lead to bad performance of batch normalization  \n",
-        "\n",
-        "###Example\n",
-        "Splitting 10 channels after a Conv2D layer into 5 subgroups in a standard \"channels last\" setting:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "aIGjLwYWAm0v"
-      },
-      "outputs": [],
-      "source": [
-        "model = tf.keras.models.Sequential([\n",
-        "  # Reshape into \"channels last\" setup.\n",
-        "  tf.keras.layers.Reshape((28,28,1), input_shape=(28,28)),\n",
-        "  tf.keras.layers.Conv2D(filters=10, kernel_size=(3,3),data_format=\"channels_last\"),\n",
-        "  # Groupnorm Layer\n",
-        "  tfa.layers.GroupNormalization(groups=5, axis=3),\n",
-        "  tf.keras.layers.Flatten(),\n",
-        "  tf.keras.layers.Dense(128, activation='relu'),\n",
-        "  tf.keras.layers.Dropout(0.2),\n",
-        "  tf.keras.layers.Dense(10, activation='softmax')\n",
-        "])\n",
-        "\n",
-        "model.compile(optimizer='adam',\n",
-        "              loss='sparse_categorical_crossentropy',\n",
-        "              metrics=['accuracy'])\n",
-        "model.fit(x_test, y_test)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "QMwUfJUib3ka"
-      },
-      "source": [
-        "## Instance Normalization Tutorial\n",
-        "### Introduction\n",
-        "Instance Normalization is special case of group normalization where the group size is the same size as the channel size (or the axis size).\n",
-        "\n",
-        "Experimental results show that instance normalization performs well on style transfer when replacing batch normalization. Recently, instance normalization has also been used as a replacement for batch normalization in GANs.\n",
-        "\n",
-        "### Example\n",
-        "Applying InstanceNormalization after a Conv2D Layer and using a uniformed initialized scale and offset factor."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "6sLVv-C8f6Kf"
-      },
-      "outputs": [],
-      "source": [
-        "model = tf.keras.models.Sequential([\n",
-        "  # Reshape into \"channels last\" setup.\n",
-        "  tf.keras.layers.Reshape((28,28,1), input_shape=(28,28)),\n",
-        "  tf.keras.layers.Conv2D(filters=10, kernel_size=(3,3),data_format=\"channels_last\"),\n",
-        "  # LayerNorm Layer\n",
-        "  tfa.layers.InstanceNormalization(axis=3, \n",
-        "                                   center=True, \n",
-        "                                   scale=True,\n",
-        "                                   beta_initializer=\"random_uniform\",\n",
-        "                                   gamma_initializer=\"random_uniform\"),\n",
-        "  tf.keras.layers.Flatten(),\n",
-        "  tf.keras.layers.Dense(128, activation='relu'),\n",
-        "  tf.keras.layers.Dropout(0.2),\n",
-        "  tf.keras.layers.Dense(10, activation='softmax')\n",
-        "])\n",
-        "\n",
-        "model.compile(optimizer='adam',\n",
-        "              loss='sparse_categorical_crossentropy',\n",
-        "              metrics=['accuracy'])\n",
-        "model.fit(x_test, y_test)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "qYdnEocRUCll"
-      },
-      "source": [
-        "## Layer Normalization Tutorial\n",
-        "### Introduction\n",
-        "Layer Normalization is special case of group normalization where the group size is 1. The mean and standard deviation is calculated from all activations of a single sample.\n",
-        "\n",
-        "Experimental results show that Layer normalization is well suited for Recurrent Neural Networks, since it works batchsize independently.\n",
-        "\n",
-        "### Example\n",
-        "\n",
-        "Applying Layernormalization after a Conv2D Layer and using a scale and offset factor. "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Fh-Pp_e5UB54"
-      },
-      "outputs": [],
-      "source": [
-        "model = tf.keras.models.Sequential([\n",
-        "  # Reshape into \"channels last\" setup.\n",
-        "  tf.keras.layers.Reshape((28,28,1), input_shape=(28,28)),\n",
-        "  tf.keras.layers.Conv2D(filters=10, kernel_size=(3,3),data_format=\"channels_last\"),\n",
-        "  # LayerNorm Layer\n",
-        "  tf.keras.layers.LayerNormalization(axis=3 , center=True , scale=True),\n",
-        "  tf.keras.layers.Flatten(),\n",
-        "  tf.keras.layers.Dense(128, activation='relu'),\n",
-        "  tf.keras.layers.Dropout(0.2),\n",
-        "  tf.keras.layers.Dense(10, activation='softmax')\n",
-        "])\n",
-        "\n",
-        "model.compile(optimizer='adam',\n",
-        "              loss='sparse_categorical_crossentropy',\n",
-        "              metrics=['accuracy'])\n",
-        "model.fit(x_test, y_test)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "shvGfnB0WpQQ"
-      },
-      "source": [
-        "## Literature\n",
-        "[Layer norm](https://arxiv.org/pdf/1607.06450.pdf)\n",
-        "\n",
-        "[Instance norm](https://arxiv.org/pdf/1607.08022.pdf)\n",
-        "\n",
-        "[Group Norm](https://arxiv.org/pdf/1803.08494.pdf)"
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "collapsed_sections": [],
-      "name": "layers_normalizations.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "wFPyjGqMQ82Q"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors.\n"
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "aNZ7aEDyQIYU"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uMOmzhPEQh7b"
+   },
+   "source": [
+    "# Normalizations\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/layers_normalizations\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/layers_normalizations.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/layers_normalizations.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/layers_normalizations.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "cthm5dovQMJl"
+   },
+   "source": [
+    "## Overview\n",
+    "This notebook gives a brief introduction to the [normalization layers](https://github.com/tensorflow/addons/blob/master/tensorflow_addons/layers/normalizations.py) of TensorFlow. Currently supported layers are:\n",
+    "* **Group Normalization** (TensorFlow Addons)\n",
+    "* **Instance Normalization** (TensorFlow Addons)\n",
+    "* **Layer Normalization** (TensorFlow Core)\n",
+    "\n",
+    "The basic idea behind these layers is to normalize the output of an activation layer to improve the convergence during training. In contrast to [batch normalization](https://keras.io/layers/normalization/) these normalizations do not work on batches, instead they normalize the activations of a single sample, making them suitable for recurrent neural networks as well. \n",
+    "\n",
+    "Typically the normalization is performed by calculating the mean and the standard deviation of a subgroup in your input tensor. It is also possible to apply a scale and an offset factor to this as well.\n",
+    "\n",
+    "\n",
+    "$y_{i} = \\frac{\\gamma ( x_{i} - \\mu )}{\\sigma }+ \\beta$\n",
+    "\n",
+    "$ y$ : Output\n",
+    "\n",
+    "$x$ : Input\n",
+    "\n",
+    "$\\gamma$ : Scale factor\n",
+    "\n",
+    "$\\mu$: mean\n",
+    "\n",
+    "$\\sigma$: standard deviation\n",
+    "\n",
+    "$\\beta$: Offset factor\n",
+    "\n",
+    "\n",
+    "The following image demonstrates the difference between these techniques. Each subplot shows an input tensor, with N as the batch axis, C as the channel axis, and (H, W)\n",
+    "as the spatial axes (Height and Width of a picture for example). The pixels in blue are normalized by the same mean and variance, computed by aggregating the values of these pixels.\n",
+    "\n",
+    "![](https://github.com/shaohua0116/Group-Normalization-Tensorflow/raw/master/figure/gn.png)\n",
+    "\n",
+    "Source: (https://arxiv.org/pdf/1803.08494.pdf)\n",
+    "\n",
+    "The weights gamma and beta are trainable in all normalization layers to compensate for the possible loss of representational ability. You can activate these factors by setting the `center` or the `scale` flag to `True`. Of course, you can use `initializers`, `constraints` and `regularizer` for `beta` and `gamma` to tune these values during the training process. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "I2XlcXf5WBHb"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "kTlbneoEUKrD"
+   },
+   "source": [
+    "### Install Tensorflow 2.0 and Tensorflow-Addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "_ZQGY_ALnirQ"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "7aGgPZG_WBHg"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "u82Gz_gOUPDZ"
+   },
+   "source": [
+    "### Preparing Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "3wso9oidUZZQ"
+   },
+   "outputs": [],
+   "source": [
+    "mnist = tf.keras.datasets.mnist\n",
+    "\n",
+    "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
+    "x_train, x_test = x_train / 255.0, x_test / 255.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "UTQH56j89POZ"
+   },
+   "source": [
+    "## Group Normalization Tutorial \n",
+    "\n",
+    "### Introduction\n",
+    "Group Normalization (GN) divides the channels of your inputs into smaller subgroups and normalizes these values based on their mean and variance. Since GN works on a single example, this technique is independent of the batch size. \n",
+    "\n",
+    "GN experimentally scored close to batch normalization in image classification tasks. It can be beneficial to use GN instead of Batch Normalization in case your overall batch_size is low, which would lead to bad performance of batch normalization.  \n",
+    "\n",
+    "### Example\n",
+    "Splitting 10 channels after a Conv2D layer into 5 subgroups in a standard \"channels last\" setting:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "aIGjLwYWAm0v"
+   },
+   "outputs": [],
+   "source": [
+    "model = tf.keras.models.Sequential(\n",
+    "    [\n",
+    "        # Reshape into \"channels last\" setup.\n",
+    "        tf.keras.layers.Reshape((28, 28, 1), input_shape=(28, 28)),\n",
+    "        tf.keras.layers.Conv2D(\n",
+    "            filters=10, kernel_size=(3, 3), data_format=\"channels_last\"\n",
+    "        ),\n",
+    "        # Groupnorm Layer\n",
+    "        tfa.layers.GroupNormalization(groups=5, axis=3),\n",
+    "        tf.keras.layers.Flatten(),\n",
+    "        tf.keras.layers.Dense(128, activation=\"relu\"),\n",
+    "        tf.keras.layers.Dropout(0.2),\n",
+    "        tf.keras.layers.Dense(10, activation=\"softmax\"),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "model.compile(\n",
+    "    optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"]\n",
+    ")\n",
+    "model.fit(x_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "QMwUfJUib3ka"
+   },
+   "source": [
+    "## Instance Normalization Tutorial\n",
+    "### Introduction\n",
+    "Instance Normalization is a special case of group normalization where the number of groups is the same as the channel size (or the axis size).\n",
+    "\n",
+    "Experimental results show that instance normalization performs well on style transfer when replacing batch normalization. Recently, instance normalization has also been used as a replacement for batch normalization in GANs.\n",
+    "\n",
+    "### Example\n",
+    "Applying InstanceNormalization after a Conv2D Layer and using a uniformly initialized scale and offset factor."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6sLVv-C8f6Kf"
+   },
+   "outputs": [],
+   "source": [
+    "model = tf.keras.models.Sequential(\n",
+    "    [\n",
+    "        # Reshape into \"channels last\" setup.\n",
+    "        tf.keras.layers.Reshape((28, 28, 1), input_shape=(28, 28)),\n",
+    "        tf.keras.layers.Conv2D(\n",
+    "            filters=10, kernel_size=(3, 3), data_format=\"channels_last\"\n",
+    "        ),\n",
+    "        # InstanceNorm Layer\n",
+    "        tfa.layers.InstanceNormalization(\n",
+    "            axis=3,\n",
+    "            center=True,\n",
+    "            scale=True,\n",
+    "            beta_initializer=\"random_uniform\",\n",
+    "            gamma_initializer=\"random_uniform\",\n",
+    "        ),\n",
+    "        tf.keras.layers.Flatten(),\n",
+    "        tf.keras.layers.Dense(128, activation=\"relu\"),\n",
+    "        tf.keras.layers.Dropout(0.2),\n",
+    "        tf.keras.layers.Dense(10, activation=\"softmax\"),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "model.compile(\n",
+    "    optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"]\n",
+    ")\n",
+    "model.fit(x_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "qYdnEocRUCll"
+   },
+   "source": [
+    "## Layer Normalization Tutorial\n",
+    "### Introduction\n",
+    "Layer Normalization is a special case of group normalization where the number of groups is 1. The mean and standard deviation are calculated from all activations of a single sample.\n",
+    "\n",
+    "Experimental results show that Layer normalization is well suited for Recurrent Neural Networks, since it works independently of the batch size.\n",
+    "\n",
+    "### Example\n",
+    "\n",
+    "Applying LayerNormalization after a Conv2D Layer and using a scale and offset factor. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Fh-Pp_e5UB54"
+   },
+   "outputs": [],
+   "source": [
+    "model = tf.keras.models.Sequential(\n",
+    "    [\n",
+    "        # Reshape into \"channels last\" setup.\n",
+    "        tf.keras.layers.Reshape((28, 28, 1), input_shape=(28, 28)),\n",
+    "        tf.keras.layers.Conv2D(\n",
+    "            filters=10, kernel_size=(3, 3), data_format=\"channels_last\"\n",
+    "        ),\n",
+    "        # LayerNorm Layer\n",
+    "        tf.keras.layers.LayerNormalization(axis=3, center=True, scale=True),\n",
+    "        tf.keras.layers.Flatten(),\n",
+    "        tf.keras.layers.Dense(128, activation=\"relu\"),\n",
+    "        tf.keras.layers.Dropout(0.2),\n",
+    "        tf.keras.layers.Dense(10, activation=\"softmax\"),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "model.compile(\n",
+    "    optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"]\n",
+    ")\n",
+    "model.fit(x_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "shvGfnB0WpQQ"
+   },
+   "source": [
+    "## Literature\n",
+    "[Layer norm](https://arxiv.org/pdf/1607.06450.pdf)\n",
+    "\n",
+    "[Instance norm](https://arxiv.org/pdf/1607.08022.pdf)\n",
+    "\n",
+    "[Group Norm](https://arxiv.org/pdf/1803.08494.pdf)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [],
+   "name": "layers_normalizations.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/layers_weightnormalization.ipynb b/docs/tutorials/layers_weightnormalization.ipynb
index f1884cf8e7..6a19b17001 100644
--- a/docs/tutorials/layers_weightnormalization.ipynb
+++ b/docs/tutorials/layers_weightnormalization.ipynb
@@ -1,311 +1,330 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Tce3stUlHN0L"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "tuOe1ymfHZPu"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MfBg1C5NB3X0"
-      },
-      "source": [
-        "# TensorFlow Addons Layers: WeightNormalization\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/layers_weightnormalization\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/layers_weightnormalization.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/layers_weightnormalization.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/layers_weightnormalization.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "xHxb-dlhMIzW"
-      },
-      "source": [
-        "## Overview\n",
-        "\n",
-        "This notebook will demonstrate how to use the Weight Normalization layer and how it can improve convergence.\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "KR01t9v_fxbT"
-      },
-      "source": [
-        "# WeightNormalization\n",
-        "\n",
-        "A Simple Reparameterization to Accelerate Training of Deep Neural Networks:\n",
-        "\n",
-        "Tim Salimans, Diederik P. Kingma (2016)\n",
-        "\n",
-        "> By reparameterizing the weights in this way you improve the conditioning of the optimization problem and speed up convergence of stochastic gradient descent. Our reparameterization is inspired by batch normalization but does not introduce any dependencies between the examples in a minibatch. This means that our method can also be applied successfully to recurrent models such as LSTMs and to noise-sensitive applications such as deep reinforcement learning or generative models, for which batch normalization is less well suited. Although our method is much simpler, it still provides much of the speed-up of full batch normalization. In addition, the computational overhead of our method is lower, permitting more optimization steps to be taken in the same amount of time.\n",
-        "\n",
-        "> https://arxiv.org/abs/1602.07868 \n",
-        "\n",
-        "<img src=\"https://raw.githubusercontent.com/seanpmorgan/tf-weightnorm/master/static/wrapped-graph.png\" width=\"80%\"><br><br>\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MUXex9ctTuDB"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "CyWHXw9mQ6mp"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "OywLbs7EXiE_"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "KQMhhq1qXiFF"
-      },
-      "outputs": [],
-      "source": [
-        "import numpy as np\n",
-        "from matplotlib import pyplot as plt"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "ULWHqMAnTVZD"
-      },
-      "outputs": [],
-      "source": [
-        "# Hyper Parameters\n",
-        "batch_size = 32\n",
-        "epochs = 10\n",
-        "num_classes=10"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "fhM0ieDpSnKh"
-      },
-      "source": [
-        "## Build Models"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "7XZXnBYgRPSk"
-      },
-      "outputs": [],
-      "source": [
-        "# Standard ConvNet\n",
-        "reg_model = tf.keras.Sequential([\n",
-        "    tf.keras.layers.Conv2D(6, 5, activation='relu'),\n",
-        "    tf.keras.layers.MaxPooling2D(2, 2),\n",
-        "    tf.keras.layers.Conv2D(16, 5, activation='relu'),\n",
-        "    tf.keras.layers.MaxPooling2D(2, 2),\n",
-        "    tf.keras.layers.Flatten(),\n",
-        "    tf.keras.layers.Dense(120, activation='relu'),\n",
-        "    tf.keras.layers.Dense(84, activation='relu'),\n",
-        "    tf.keras.layers.Dense(num_classes, activation='softmax'),\n",
-        "])"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "UZd6V90eR4Gm"
-      },
-      "outputs": [],
-      "source": [
-        "# WeightNorm ConvNet\n",
-        "wn_model = tf.keras.Sequential([\n",
-        "    tfa.layers.WeightNormalization(tf.keras.layers.Conv2D(6, 5, activation='relu')),\n",
-        "    tf.keras.layers.MaxPooling2D(2, 2),\n",
-        "    tfa.layers.WeightNormalization(tf.keras.layers.Conv2D(16, 5, activation='relu')),\n",
-        "    tf.keras.layers.MaxPooling2D(2, 2),\n",
-        "    tf.keras.layers.Flatten(),\n",
-        "    tfa.layers.WeightNormalization(tf.keras.layers.Dense(120, activation='relu')),\n",
-        "    tfa.layers.WeightNormalization(tf.keras.layers.Dense(84, activation='relu')),\n",
-        "    tfa.layers.WeightNormalization(tf.keras.layers.Dense(num_classes, activation='softmax')),\n",
-        "])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "AA5dti8AS2Y7"
-      },
-      "source": [
-        "## Load Data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "L8Isjc7W8MEn"
-      },
-      "outputs": [],
-      "source": [
-        "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()\n",
-        "\n",
-        "# Convert class vectors to binary class matrices.\n",
-        "y_train = tf.keras.utils.to_categorical(y_train, num_classes)\n",
-        "y_test = tf.keras.utils.to_categorical(y_test, num_classes)\n",
-        "\n",
-        "x_train = x_train.astype('float32')\n",
-        "x_test = x_test.astype('float32')\n",
-        "x_train /= 255\n",
-        "x_test /= 255"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "cH1CG9E7S34C"
-      },
-      "source": [
-        "## Train Models"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "EvNKxfaI7vSm"
-      },
-      "outputs": [],
-      "source": [
-        "reg_model.compile(optimizer='adam', \n",
-        "                  loss='categorical_crossentropy',\n",
-        "                  metrics=['accuracy'])\n",
-        "\n",
-        "reg_history = reg_model.fit(x_train, y_train,\n",
-        "                            batch_size=batch_size,\n",
-        "                            epochs=epochs,\n",
-        "                            validation_data=(x_test, y_test),\n",
-        "                            shuffle=True)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "esmMh-5g7wmp"
-      },
-      "outputs": [],
-      "source": [
-        "wn_model.compile(optimizer='adam', \n",
-        "                 loss='categorical_crossentropy',\n",
-        "                 metrics=['accuracy'])\n",
-        "\n",
-        "wn_history = wn_model.fit(x_train, y_train,\n",
-        "                          batch_size=batch_size,\n",
-        "                          epochs=epochs,\n",
-        "                          validation_data=(x_test, y_test),\n",
-        "                          shuffle=True)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "yujf2YRbwX55"
-      },
-      "outputs": [],
-      "source": [
-        "reg_accuracy = reg_history.history['accuracy']\n",
-        "wn_accuracy = wn_history.history['accuracy']\n",
-        "\n",
-        "plt.plot(np.linspace(0, epochs,  epochs), reg_accuracy,\n",
-        "             color='red', label='Regular ConvNet')\n",
-        "\n",
-        "plt.plot(np.linspace(0, epochs, epochs), wn_accuracy,\n",
-        "         color='blue', label='WeightNorm ConvNet')\n",
-        "\n",
-        "plt.title('WeightNorm Accuracy Comparison')\n",
-        "plt.legend()\n",
-        "plt.grid(True)\n",
-        "plt.show()"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "layers_weightnormalization.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Tce3stUlHN0L"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors.\n"
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "tuOe1ymfHZPu"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MfBg1C5NB3X0"
+   },
+   "source": [
+    "# TensorFlow Addons Layers: WeightNormalization\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/layers_weightnormalization\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/layers_weightnormalization.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/layers_weightnormalization.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/layers_weightnormalization.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xHxb-dlhMIzW"
+   },
+   "source": [
+    "## Overview\n",
+    "\n",
+    "This notebook will demonstrate how to use the Weight Normalization layer and how it can improve convergence.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "KR01t9v_fxbT"
+   },
+   "source": [
+    "# WeightNormalization\n",
+    "\n",
+    "A Simple Reparameterization to Accelerate Training of Deep Neural Networks:\n",
+    "\n",
+    "Tim Salimans, Diederik P. Kingma (2016)\n",
+    "\n",
+    "> By reparameterizing the weights in this way you improve the conditioning of the optimization problem and speed up convergence of stochastic gradient descent. Our reparameterization is inspired by batch normalization but does not introduce any dependencies between the examples in a minibatch. This means that our method can also be applied successfully to recurrent models such as LSTMs and to noise-sensitive applications such as deep reinforcement learning or generative models, for which batch normalization is less well suited. Although our method is much simpler, it still provides much of the speed-up of full batch normalization. In addition, the computational overhead of our method is lower, permitting more optimization steps to be taken in the same amount of time.\n",
+    "\n",
+    "> https://arxiv.org/abs/1602.07868 \n",
+    "\n",
+    "<img src=\"https://raw.githubusercontent.com/seanpmorgan/tf-weightnorm/master/static/wrapped-graph.png\" width=\"80%\"><br><br>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MUXex9ctTuDB"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "CyWHXw9mQ6mp"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "OywLbs7EXiE_"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "KQMhhq1qXiFF"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from matplotlib import pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ULWHqMAnTVZD"
+   },
+   "outputs": [],
+   "source": [
+    "# Hyper Parameters\n",
+    "batch_size = 32\n",
+    "epochs = 10\n",
+    "num_classes = 10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "fhM0ieDpSnKh"
+   },
+   "source": [
+    "## Build Models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "7XZXnBYgRPSk"
+   },
+   "outputs": [],
+   "source": [
+    "# Standard ConvNet\n",
+    "reg_model = tf.keras.Sequential(\n",
+    "    [\n",
+    "        tf.keras.layers.Conv2D(6, 5, activation=\"relu\"),\n",
+    "        tf.keras.layers.MaxPooling2D(2, 2),\n",
+    "        tf.keras.layers.Conv2D(16, 5, activation=\"relu\"),\n",
+    "        tf.keras.layers.MaxPooling2D(2, 2),\n",
+    "        tf.keras.layers.Flatten(),\n",
+    "        tf.keras.layers.Dense(120, activation=\"relu\"),\n",
+    "        tf.keras.layers.Dense(84, activation=\"relu\"),\n",
+    "        tf.keras.layers.Dense(num_classes, activation=\"softmax\"),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "UZd6V90eR4Gm"
+   },
+   "outputs": [],
+   "source": [
+    "# WeightNorm ConvNet\n",
+    "wn_model = tf.keras.Sequential(\n",
+    "    [\n",
+    "        tfa.layers.WeightNormalization(tf.keras.layers.Conv2D(6, 5, activation=\"relu\")),\n",
+    "        tf.keras.layers.MaxPooling2D(2, 2),\n",
+    "        tfa.layers.WeightNormalization(\n",
+    "            tf.keras.layers.Conv2D(16, 5, activation=\"relu\")\n",
+    "        ),\n",
+    "        tf.keras.layers.MaxPooling2D(2, 2),\n",
+    "        tf.keras.layers.Flatten(),\n",
+    "        tfa.layers.WeightNormalization(tf.keras.layers.Dense(120, activation=\"relu\")),\n",
+    "        tfa.layers.WeightNormalization(tf.keras.layers.Dense(84, activation=\"relu\")),\n",
+    "        tfa.layers.WeightNormalization(\n",
+    "            tf.keras.layers.Dense(num_classes, activation=\"softmax\")\n",
+    "        ),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "AA5dti8AS2Y7"
+   },
+   "source": [
+    "## Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "L8Isjc7W8MEn"
+   },
+   "outputs": [],
+   "source": [
+    "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()\n",
+    "\n",
+    "# Convert class vectors to binary class matrices.\n",
+    "y_train = tf.keras.utils.to_categorical(y_train, num_classes)\n",
+    "y_test = tf.keras.utils.to_categorical(y_test, num_classes)\n",
+    "\n",
+    "x_train = x_train.astype(\"float32\")\n",
+    "x_test = x_test.astype(\"float32\")\n",
+    "x_train /= 255\n",
+    "x_test /= 255"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "cH1CG9E7S34C"
+   },
+   "source": [
+    "## Train Models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "EvNKxfaI7vSm"
+   },
+   "outputs": [],
+   "source": [
+    "reg_model.compile(\n",
+    "    optimizer=\"adam\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"]\n",
+    ")\n",
+    "\n",
+    "reg_history = reg_model.fit(\n",
+    "    x_train,\n",
+    "    y_train,\n",
+    "    batch_size=batch_size,\n",
+    "    epochs=epochs,\n",
+    "    validation_data=(x_test, y_test),\n",
+    "    shuffle=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "esmMh-5g7wmp"
+   },
+   "outputs": [],
+   "source": [
+    "wn_model.compile(\n",
+    "    optimizer=\"adam\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"]\n",
+    ")\n",
+    "\n",
+    "wn_history = wn_model.fit(\n",
+    "    x_train,\n",
+    "    y_train,\n",
+    "    batch_size=batch_size,\n",
+    "    epochs=epochs,\n",
+    "    validation_data=(x_test, y_test),\n",
+    "    shuffle=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "yujf2YRbwX55"
+   },
+   "outputs": [],
+   "source": [
+    "reg_accuracy = reg_history.history[\"accuracy\"]\n",
+    "wn_accuracy = wn_history.history[\"accuracy\"]\n",
+    "\n",
+    "plt.plot(\n",
+    "    np.linspace(0, epochs, epochs), reg_accuracy, color=\"red\", label=\"Regular ConvNet\"\n",
+    ")\n",
+    "\n",
+    "plt.plot(\n",
+    "    np.linspace(0, epochs, epochs),\n",
+    "    wn_accuracy,\n",
+    "    color=\"blue\",\n",
+    "    label=\"WeightNorm ConvNet\",\n",
+    ")\n",
+    "\n",
+    "plt.title(\"WeightNorm Accuracy Comparison\")\n",
+    "plt.legend()\n",
+    "plt.grid(True)\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "layers_weightnormalization.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/losses_triplet.ipynb b/docs/tutorials/losses_triplet.ipynb
index fd29dfc898..73aba8d7bb 100644
--- a/docs/tutorials/losses_triplet.ipynb
+++ b/docs/tutorials/losses_triplet.ipynb
@@ -1,326 +1,341 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Tce3stUlHN0L"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "tuOe1ymfHZPu"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MfBg1C5NB3X0"
-      },
-      "source": [
-        "# TensorFlow Addons Losses: TripletSemiHardLoss\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/losses_triplet\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/losses_triplet.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/losses_triplet.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/losses_triplet.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "xHxb-dlhMIzW"
-      },
-      "source": [
-        "## Overview\n",
-        "This notebook will demonstrate how to use the TripletSemiHardLoss function in TensorFlow Addons.\n",
-        "\n",
-        "### Resources:\n",
-        "* [FaceNet:  A Unified Embedding for Face Recognition and Clustering](https://arxiv.org/pdf/1503.03832.pdf)\n",
-        "* [Oliver Moindrot's blog does an excellent job of describing the algorithm in detail](https://omoindrot.github.io/triplet-loss)\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "bQwBbFVAyHJ_"
-      },
-      "source": [
-        "## TripletLoss\n",
-        "\n",
-        "As first introduced in the FaceNet paper, TripletLoss is a loss function that trains a neural network to closely embed features of the same class while maximizing the distance between embeddings of different classes.  To do this an anchor  is chosen along with one negative and one positive sample.\n",
-        "![fig3](https://user-images.githubusercontent.com/18154355/61485418-1cbb1f00-a96f-11e9-8de8-3c46eef5a7dc.png)\n",
-        "\n",
-        "**The loss function is described as a Euclidean distance function:**\n",
-        "\n",
-        "![function](https://user-images.githubusercontent.com/18154355/61484709-7589b800-a96d-11e9-9c3c-e880514af4b7.png)\n",
-        "\n",
-        "Where A is our anchor input,  P is the positive sample input,  N is the negative sample input, and alpha is some margin you use to specify when a triplet has become too \"easy\" and you no longer want to adjust the weights from it."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "wPJ5521HZHeL"
-      },
-      "source": [
-        "## SemiHard Online Learning\n",
-        "As shown in the paper, the best results are from triplets known as \"Semi-Hard\". These are defined as triplets where the negative is farther from the anchor than the positive, but still produces a positive loss. To efficiently find these triplets you utilize online learning and only train from the Semi-Hard examples in each batch. \n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MUXex9ctTuDB"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "6Vyo25M2ba1P"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "IqR2PQG4ZaZ0"
-      },
-      "outputs": [],
-      "source": [
-        "import io\n",
-        "import numpy as np"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "WH_7-ZYZYblV"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa\n",
-        "import tensorflow_datasets as tfds"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "0_D7CZqkv_Hj"
-      },
-      "source": [
-        "## Prepare the Data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "iXvByj6wcT7d"
-      },
-      "outputs": [],
-      "source": [
-        "def _normalize_img(img, label):\n",
-        "    img = tf.cast(img, tf.float32) / 255.\n",
-        "    return (img, label)\n",
-        "\n",
-        "train_dataset, test_dataset = tfds.load(name=\"mnist\", split=['train', 'test'], as_supervised=True)\n",
-        "\n",
-        "# Build your input pipelines\n",
-        "train_dataset = train_dataset.shuffle(1024).batch(32)\n",
-        "train_dataset = train_dataset.map(_normalize_img)\n",
-        "\n",
-        "test_dataset = test_dataset.batch(32)\n",
-        "test_dataset = test_dataset.map(_normalize_img)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "KR01t9v_fxbT"
-      },
-      "source": [
-        "## Build the Model"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "wvOPPuIKhLJi"
-      },
-      "source": [
-        "![fig2](https://user-images.githubusercontent.com/18154355/61485417-1cbb1f00-a96f-11e9-8d6a-94964ce8c4db.png)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "djpoAvfWNyL5"
-      },
-      "outputs": [],
-      "source": [
-        "model = tf.keras.Sequential([\n",
-        "    tf.keras.layers.Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(28,28,1)),\n",
-        "    tf.keras.layers.MaxPooling2D(pool_size=2),\n",
-        "    tf.keras.layers.Dropout(0.3),\n",
-        "    tf.keras.layers.Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'),\n",
-        "    tf.keras.layers.MaxPooling2D(pool_size=2),\n",
-        "    tf.keras.layers.Dropout(0.3),\n",
-        "    tf.keras.layers.Flatten(),\n",
-        "    tf.keras.layers.Dense(256, activation=None), # No activation on final dense layer\n",
-        "    tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)) # L2 normalize embeddings\n",
-        "\n",
-        "])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "HYE-BxhOzFQp"
-      },
-      "source": [
-        "## Train and Evaluate"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "NxfYhtiSzHf-"
-      },
-      "outputs": [],
-      "source": [
-        "# Compile the model\n",
-        "model.compile(\n",
-        "    optimizer=tf.keras.optimizers.Adam(0.001),\n",
-        "    loss=tfa.losses.TripletSemiHardLoss())\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "TGBYNGxgVDrj"
-      },
-      "outputs": [],
-      "source": [
-        "# Train the network\n",
-        "history = model.fit(\n",
-        "    train_dataset,\n",
-        "    epochs=5)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "1Y--0tK69SXf"
-      },
-      "outputs": [],
-      "source": [
-        "# Evaluate the network\n",
-        "results = model.predict(test_dataset)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "dqSuLdVZGNrZ"
-      },
-      "outputs": [],
-      "source": [
-        "# Save test embeddings for visualization in projector\n",
-        "np.savetxt(\"vecs.tsv\", results, delimiter='\\t')\n",
-        "\n",
-        "out_m = io.open('meta.tsv', 'w', encoding='utf-8')\n",
-        "for img, labels in tfds.as_numpy(test_dataset):\n",
-        "    [out_m.write(str(x) + \"\\n\") for x in labels]\n",
-        "out_m.close()\n",
-        "\n",
-        "\n",
-        "try:\n",
-        "  from google.colab import files\n",
-        "  files.download('vecs.tsv')\n",
-        "  files.download('meta.tsv')\n",
-        "except:\n",
-        "  pass"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "VAtj_m6Z_Uwe"
-      },
-      "source": [
-        "## Embedding Projector"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Y4rjlG9rlbVA"
-      },
-      "source": [
-        "The vector and metadata files can be loaded and visualized here: https://projector.tensorflow.org/\n",
-        "\n",
-        "You can see the results of our embedded test data when visualized with UMAP:\n",
-        "![embedding](https://user-images.githubusercontent.com/18154355/61600295-e6470380-abfd-11e9-8a00-2b25e7e6916f.png)\n"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "losses_triplet.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Tce3stUlHN0L"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors.\n"
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "tuOe1ymfHZPu"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MfBg1C5NB3X0"
+   },
+   "source": [
+    "# TensorFlow Addons Losses: TripletSemiHardLoss\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/losses_triplet\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/losses_triplet.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/losses_triplet.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/losses_triplet.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xHxb-dlhMIzW"
+   },
+   "source": [
+    "## Overview\n",
+    "This notebook will demonstrate how to use the TripletSemiHardLoss function in TensorFlow Addons.\n",
+    "\n",
+    "### Resources:\n",
+    "* [FaceNet:  A Unified Embedding for Face Recognition and Clustering](https://arxiv.org/pdf/1503.03832.pdf)\n",
+    "* [Olivier Moindrot's blog does an excellent job of describing the algorithm in detail](https://omoindrot.github.io/triplet-loss)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "bQwBbFVAyHJ_"
+   },
+   "source": [
+    "## TripletLoss\n",
+    "\n",
+    "As first introduced in the FaceNet paper, TripletLoss is a loss function that trains a neural network to closely embed features of the same class while maximizing the distance between embeddings of different classes. To do this, an anchor is chosen along with one negative and one positive sample.\n",
+    "![fig3](https://user-images.githubusercontent.com/18154355/61485418-1cbb1f00-a96f-11e9-8de8-3c46eef5a7dc.png)\n",
+    "\n",
+    "**The loss function is described as a Euclidean distance function:**\n",
+    "\n",
+    "![function](https://user-images.githubusercontent.com/18154355/61484709-7589b800-a96d-11e9-9c3c-e880514af4b7.png)\n",
+    "\n",
+    "Where A is our anchor input,  P is the positive sample input,  N is the negative sample input, and alpha is some margin you use to specify when a triplet has become too \"easy\" and you no longer want to adjust the weights from it."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "wPJ5521HZHeL"
+   },
+   "source": [
+    "## SemiHard Online Learning\n",
+    "As shown in the paper, the best results are from triplets known as \"Semi-Hard\". These are defined as triplets where the negative is farther from the anchor than the positive, but still produce a positive loss. To efficiently find these triplets, you utilize online learning and only train from the Semi-Hard examples in each batch. \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MUXex9ctTuDB"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6Vyo25M2ba1P"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IqR2PQG4ZaZ0"
+   },
+   "outputs": [],
+   "source": [
+    "import io\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "WH_7-ZYZYblV"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa\n",
+    "import tensorflow_datasets as tfds"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0_D7CZqkv_Hj"
+   },
+   "source": [
+    "## Prepare the Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "iXvByj6wcT7d"
+   },
+   "outputs": [],
+   "source": [
+    "def _normalize_img(img, label):\n",
+    "    img = tf.cast(img, tf.float32) / 255.0\n",
+    "    return (img, label)\n",
+    "\n",
+    "\n",
+    "train_dataset, test_dataset = tfds.load(\n",
+    "    name=\"mnist\", split=[\"train\", \"test\"], as_supervised=True\n",
+    ")\n",
+    "\n",
+    "# Build your input pipelines\n",
+    "train_dataset = train_dataset.shuffle(1024).batch(32)\n",
+    "train_dataset = train_dataset.map(_normalize_img)\n",
+    "\n",
+    "test_dataset = test_dataset.batch(32)\n",
+    "test_dataset = test_dataset.map(_normalize_img)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "KR01t9v_fxbT"
+   },
+   "source": [
+    "## Build the Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "wvOPPuIKhLJi"
+   },
+   "source": [
+    "![fig2](https://user-images.githubusercontent.com/18154355/61485417-1cbb1f00-a96f-11e9-8d6a-94964ce8c4db.png)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "djpoAvfWNyL5"
+   },
+   "outputs": [],
+   "source": [
+    "model = tf.keras.Sequential(\n",
+    "    [\n",
+    "        tf.keras.layers.Conv2D(\n",
+    "            filters=64,\n",
+    "            kernel_size=2,\n",
+    "            padding=\"same\",\n",
+    "            activation=\"relu\",\n",
+    "            input_shape=(28, 28, 1),\n",
+    "        ),\n",
+    "        tf.keras.layers.MaxPooling2D(pool_size=2),\n",
+    "        tf.keras.layers.Dropout(0.3),\n",
+    "        tf.keras.layers.Conv2D(\n",
+    "            filters=32, kernel_size=2, padding=\"same\", activation=\"relu\"\n",
+    "        ),\n",
+    "        tf.keras.layers.MaxPooling2D(pool_size=2),\n",
+    "        tf.keras.layers.Dropout(0.3),\n",
+    "        tf.keras.layers.Flatten(),\n",
+    "        tf.keras.layers.Dense(\n",
+    "            256, activation=None\n",
+    "        ),  # No activation on final dense layer\n",
+    "        tf.keras.layers.Lambda(\n",
+    "            lambda x: tf.math.l2_normalize(x, axis=1)\n",
+    "        ),  # L2 normalize embeddings\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "HYE-BxhOzFQp"
+   },
+   "source": [
+    "## Train and Evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NxfYhtiSzHf-"
+   },
+   "outputs": [],
+   "source": [
+    "# Compile the model\n",
+    "model.compile(\n",
+    "    optimizer=tf.keras.optimizers.Adam(0.001), loss=tfa.losses.TripletSemiHardLoss()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "TGBYNGxgVDrj"
+   },
+   "outputs": [],
+   "source": [
+    "# Train the network\n",
+    "history = model.fit(train_dataset, epochs=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "1Y--0tK69SXf"
+   },
+   "outputs": [],
+   "source": [
+    "# Evaluate the network\n",
+    "results = model.predict(test_dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "dqSuLdVZGNrZ"
+   },
+   "outputs": [],
+   "source": [
+    "# Save test embeddings for visualization in projector\n",
+    "np.savetxt(\"vecs.tsv\", results, delimiter=\"\\t\")\n",
+    "\n",
+    "out_m = io.open(\"meta.tsv\", \"w\", encoding=\"utf-8\")\n",
+    "for img, labels in tfds.as_numpy(test_dataset):\n",
+    "    [out_m.write(str(x) + \"\\n\") for x in labels]\n",
+    "out_m.close()\n",
+    "\n",
+    "\n",
+    "try:\n",
+    "    from google.colab import files\n",
+    "\n",
+    "    files.download(\"vecs.tsv\")\n",
+    "    files.download(\"meta.tsv\")\n",
+    "except:\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "VAtj_m6Z_Uwe"
+   },
+   "source": [
+    "## Embedding Projector"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Y4rjlG9rlbVA"
+   },
+   "source": [
+    "The vector and metadata files can be loaded and visualized here: https://projector.tensorflow.org/\n",
+    "\n",
+    "You can see the results of our embedded test data when visualized with UMAP:\n",
+    "![embedding](https://user-images.githubusercontent.com/18154355/61600295-e6470380-abfd-11e9-8a00-2b25e7e6916f.png)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "losses_triplet.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/networks_seq2seq_nmt.ipynb b/docs/tutorials/networks_seq2seq_nmt.ipynb
index 371e7b1e65..06efc31fee 100644
--- a/docs/tutorials/networks_seq2seq_nmt.ipynb
+++ b/docs/tutorials/networks_seq2seq_nmt.ipynb
@@ -1,1105 +1,1205 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "5aElYAKlV2Mi"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "wmYJlt6LWVOU"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "L-8q8rRRWcp6"
-      },
-      "source": [
-        "# TensorFlow Addons Networks : Sequence-to-Sequence NMT with Attention Mechanism\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/networks_seq2seq_nmt.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/networks_seq2seq_nmt.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "      <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/networks_seq2seq_nmt.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "9n0dcDw1Wszw"
-      },
-      "source": [
-        "## Overview\n",
-        "This notebook gives a brief introduction into the ***Sequence to Sequence Model Architecture***\n",
-        "In this noteboook you broadly cover four essential topics necessary for Neural Machine Translation:\n",
-        "\n",
-        "\n",
-        "* **Data cleaning**\n",
-        "* **Data preparation**\n",
-        "* **Neural Translation Model with Attention**\n",
-        "* **Final Translation with ```tf.addons.seq2seq.BasicDecoder``` and ```tf.addons.seq2seq.BeamSearchDecoder```** \n",
-        "\n",
-        "The basic idea behind such a model though, is only the encoder-decoder architecture. These networks are usually used for a variety of tasks like text-summerization, Machine translation, Image Captioning, etc. This tutorial provideas a hands-on understanding of the concept, explaining the technical jargons wherever necessary. You focus on the task of Neural Machine Translation (NMT) which was the very first testbed for seq2seq models.\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MpySVYWJhxaV"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "_kxfdP4hJUPB"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Collecting tensorflow-addons==0.11.2\n",
-            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b3/f8/d6fca180c123f2851035c4493690662ebdad0849a9059d56035434bff5c9/tensorflow_addons-0.11.2-cp36-cp36m-manylinux2010_x86_64.whl (1.1MB)\n",
-            "\u001b[K     |████████████████████████████████| 1.1MB 4.4MB/s \n",
-            "\u001b[?25hRequirement already satisfied: typeguard>=2.7 in /usr/local/lib/python3.6/dist-packages (from tensorflow-addons==0.11.2) (2.7.1)\n",
-            "Installing collected packages: tensorflow-addons\n",
-            "  Found existing installation: tensorflow-addons 0.11.0\n",
-            "    Uninstalling tensorflow-addons-0.11.0:\n",
-            "      Successfully uninstalled tensorflow-addons-0.11.0\n",
-            "Successfully installed tensorflow-addons-0.11.2\n"
-          ]
-        }
-      ],
-      "source": [
-        "!pip install tensorflow-addons==0.11.2"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "tnxXKDjq3jEL"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa\n",
-        "\n",
-        "import matplotlib.pyplot as plt\n",
-        "import matplotlib.ticker as ticker\n",
-        "from sklearn.model_selection import train_test_split\n",
-        "\n",
-        "import unicodedata\n",
-        "import re\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import io\n",
-        "import time\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Ii_vg-XNXTil"
-      },
-      "source": [
-        "## Data Cleaning and Data Preparation \n",
-        "\n",
-        "You'll use a language dataset provided by http://www.manythings.org/anki/. This dataset contains language translation pairs in the format:\n",
-        "\n",
-        "---\n",
-        "      May I borrow this book?    ¿Puedo tomar prestado este libro?\n",
-        "---\n",
-        "\n",
-        "\n",
-        "There are a variety of languages available, but you'll use the English-Spanish dataset. After downloading the dataset, here are the steps you'll take to prepare the data:\n",
-        "\n",
-        "\n",
-        "1. Add a start and end token to each sentence.\n",
-        "2. Clean the sentences by removing special characters.\n",
-        "3. Create a Vocabulary with word index (mapping from word → id) and reverse word index (mapping from id → word).\n",
-        "5. Pad each sentence to a maximum length. (Why? you need to fix the maximum length for the inputs to recurrent encoders)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "PvRnGWnvXm6l"
-      },
-      "outputs": [],
-      "source": [
-        "def download_nmt():\n",
-        "    path_to_zip = tf.keras.utils.get_file(\n",
-        "    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',\n",
-        "    extract=True)\n",
-        "\n",
-        "    path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\"\n",
-        "    return path_to_file\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "NFKB2c_tX4wU"
-      },
-      "source": [
-        "### Define a NMTDataset class with necessary functions to follow Step 1 to Step 4. \n",
-        "The ```call()``` will return:\n",
-        "1. ```train_dataset```  and ```val_dataset``` : ```tf.data.Dataset``` objects\n",
-        "2. ```inp_lang_tokenizer``` and ```targ_lang_tokenizer``` : ```tf.keras.preprocessing.text.Tokenizer``` objects "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "JMAHz7kJXc5N"
-      },
-      "outputs": [],
-      "source": [
-        "class NMTDataset:\n",
-        "    def __init__(self, problem_type='en-spa'):\n",
-        "        self.problem_type = 'en-spa'\n",
-        "        self.inp_lang_tokenizer = None\n",
-        "        self.targ_lang_tokenizer = None\n",
-        "    \n",
-        "\n",
-        "    def unicode_to_ascii(self, s):\n",
-        "        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')\n",
-        "\n",
-        "    ## Step 1 and Step 2 \n",
-        "    def preprocess_sentence(self, w):\n",
-        "        w = self.unicode_to_ascii(w.lower().strip())\n",
-        "\n",
-        "        # creating a space between a word and the punctuation following it\n",
-        "        # eg: \"he is a boy.\" => \"he is a boy .\"\n",
-        "        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n",
-        "        w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n",
-        "        w = re.sub(r'[\" \"]+', \" \", w)\n",
-        "\n",
-        "        # replacing everything with space except (a-z, A-Z, \".\", \"?\", \"!\", \",\")\n",
-        "        w = re.sub(r\"[^a-zA-Z?.!,¿]+\", \" \", w)\n",
-        "\n",
-        "        w = w.strip()\n",
-        "\n",
-        "        # adding a start and an end token to the sentence\n",
-        "        # so that the model know when to start and stop predicting.\n",
-        "        w = '<start> ' + w + ' <end>'\n",
-        "        return w\n",
-        "    \n",
-        "    def create_dataset(self, path, num_examples):\n",
-        "        # path : path to spa-eng.txt file\n",
-        "        # num_examples : Limit the total number of training example for faster training (set num_examples = len(lines) to use full data)\n",
-        "        lines = io.open(path, encoding='UTF-8').read().strip().split('\\n')\n",
-        "        word_pairs = [[self.preprocess_sentence(w) for w in l.split('\\t')]  for l in lines[:num_examples]]\n",
-        "\n",
-        "        return zip(*word_pairs)\n",
-        "\n",
-        "    # Step 3 and Step 4\n",
-        "    def tokenize(self, lang):\n",
-        "        # lang = list of sentences in a language\n",
-        "        \n",
-        "        # print(len(lang), \"example sentence: {}\".format(lang[0]))\n",
-        "        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')\n",
-        "        lang_tokenizer.fit_on_texts(lang)\n",
-        "\n",
-        "        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) \n",
-        "        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)\n",
-        "        tensor = lang_tokenizer.texts_to_sequences(lang) \n",
-        "\n",
-        "        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences \n",
-        "        ## and pads the sequences to match the longest sequences in the given input\n",
-        "        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')\n",
-        "\n",
-        "        return tensor, lang_tokenizer\n",
-        "\n",
-        "    def load_dataset(self, path, num_examples=None):\n",
-        "        # creating cleaned input, output pairs\n",
-        "        targ_lang, inp_lang = self.create_dataset(path, num_examples)\n",
-        "\n",
-        "        input_tensor, inp_lang_tokenizer = self.tokenize(inp_lang)\n",
-        "        target_tensor, targ_lang_tokenizer = self.tokenize(targ_lang)\n",
-        "\n",
-        "        return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer\n",
-        "\n",
-        "    def call(self, num_examples, BUFFER_SIZE, BATCH_SIZE):\n",
-        "        file_path = download_nmt()\n",
-        "        input_tensor, target_tensor, self.inp_lang_tokenizer, self.targ_lang_tokenizer = self.load_dataset(file_path, num_examples)\n",
-        "        \n",
-        "        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n",
-        "\n",
-        "        train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))\n",
-        "        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)\n",
-        "\n",
-        "        val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))\n",
-        "        val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)\n",
-        "\n",
-        "        return train_dataset, val_dataset, self.inp_lang_tokenizer, self.targ_lang_tokenizer"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "EIW4NVBmJ25k"
-      },
-      "outputs": [],
-      "source": [
-        "BUFFER_SIZE = 32000\n",
-        "BATCH_SIZE = 64\n",
-        "# Let's limit the #training examples for faster training\n",
-        "num_examples = 30000\n",
-        "\n",
-        "dataset_creator = NMTDataset('en-spa')\n",
-        "train_dataset, val_dataset, inp_lang, targ_lang = dataset_creator.call(num_examples, BUFFER_SIZE, BATCH_SIZE)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "w2lCTy4vKOkB"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/plain": [
-              "(TensorShape([64, 16]), TensorShape([64, 11]))"
-            ]
-          },
-          "execution_count": 7,
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "example_input_batch, example_target_batch = next(iter(train_dataset))\n",
-        "example_input_batch.shape, example_target_batch.shape"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "rgCLkfv5uO3d"
-      },
-      "source": [
-        "### Some important parameters"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "TqHsArVZ3jFS"
-      },
-      "outputs": [],
-      "source": [
-        "vocab_inp_size = len(inp_lang.word_index)+1\n",
-        "vocab_tar_size = len(targ_lang.word_index)+1\n",
-        "max_length_input = example_input_batch.shape[1]\n",
-        "max_length_output = example_target_batch.shape[1]\n",
-        "\n",
-        "embedding_dim = 256\n",
-        "units = 1024\n",
-        "steps_per_epoch = num_examples//BATCH_SIZE\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "g-yY9c6aIu1h"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "max_length_spanish, max_length_english, vocab_size_spanish, vocab_size_english\n"
-          ]
-        },
-        {
-          "data": {
-            "text/plain": [
-              "(16, 11, 9415, 4936)"
-            ]
-          },
-          "execution_count": 9,
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "print(\"max_length_english, max_length_spanish, vocab_size_english, vocab_size_spanish\")\n",
-        "max_length_input, max_length_output, vocab_inp_size, vocab_tar_size"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "nZ2rI24i3jFg"
-      },
-      "outputs": [],
-      "source": [
-        "##### \n",
-        "\n",
-        "class Encoder(tf.keras.Model):\n",
-        "  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n",
-        "    super(Encoder, self).__init__()\n",
-        "    self.batch_sz = batch_sz\n",
-        "    self.enc_units = enc_units\n",
-        "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-        "\n",
-        "    ##-------- LSTM layer in Encoder ------- ##\n",
-        "    self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,\n",
-        "                                   return_sequences=True,\n",
-        "                                   return_state=True,\n",
-        "                                   recurrent_initializer='glorot_uniform')\n",
-        "    \n",
-        "\n",
-        "\n",
-        "  def call(self, x, hidden):\n",
-        "    x = self.embedding(x)\n",
-        "    output, h, c = self.lstm_layer(x, initial_state = hidden)\n",
-        "    return output, h, c\n",
-        "\n",
-        "  def initialize_hidden_state(self):\n",
-        "    return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))] "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "60gSVh05Jl6l"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)\n",
-            "Encoder h vecotr shape: (batch size, units) (64, 1024)\n",
-            "Encoder c vector shape: (batch size, units) (64, 1024)\n"
-          ]
-        }
-      ],
-      "source": [
-        "## Test Encoder Stack\n",
-        "\n",
-        "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n",
-        "\n",
-        "\n",
-        "# sample input\n",
-        "sample_hidden = encoder.initialize_hidden_state()\n",
-        "sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)\n",
-        "print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))\n",
-        "print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))\n",
-        "print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "yJ_B3mhW3jFk"
-      },
-      "outputs": [],
-      "source": [
-        "class Decoder(tf.keras.Model):\n",
-        "  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):\n",
-        "    super(Decoder, self).__init__()\n",
-        "    self.batch_sz = batch_sz\n",
-        "    self.dec_units = dec_units\n",
-        "    self.attention_type = attention_type\n",
-        "    \n",
-        "    # Embedding Layer\n",
-        "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-        "    \n",
-        "    #Final Dense layer on which softmax will be applied\n",
-        "    self.fc = tf.keras.layers.Dense(vocab_size)\n",
-        "\n",
-        "    # Define the fundamental cell for decoder recurrent structure\n",
-        "    self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)\n",
-        "   \n",
-        "\n",
-        "\n",
-        "    # Sampler\n",
-        "    self.sampler = tfa.seq2seq.sampler.TrainingSampler()\n",
-        "\n",
-        "    # Create attention mechanism with memory = None\n",
-        "    self.attention_mechanism = self.build_attention_mechanism(self.dec_units, \n",
-        "                                                              None, self.batch_sz*[max_length_input], self.attention_type)\n",
-        "\n",
-        "    # Wrap attention mechanism with the fundamental rnn cell of decoder\n",
-        "    self.rnn_cell = self.build_rnn_cell(batch_sz)\n",
-        "\n",
-        "    # Define the decoder with respect to fundamental rnn cell\n",
-        "    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)\n",
-        "\n",
-        "    \n",
-        "  def build_rnn_cell(self, batch_sz):\n",
-        "    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, \n",
-        "                                  self.attention_mechanism, attention_layer_size=self.dec_units)\n",
-        "    return rnn_cell\n",
-        "\n",
-        "  def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):\n",
-        "    # ------------- #\n",
-        "    # typ: Which sort of attention (Bahdanau, Luong)\n",
-        "    # dec_units: final dimension of attention outputs \n",
-        "    # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)\n",
-        "    # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)\n",
-        "\n",
-        "    if(attention_type=='bahdanau'):\n",
-        "      return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)\n",
-        "    else:\n",
-        "      return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)\n",
-        "\n",
-        "  def build_initial_state(self, batch_sz, encoder_state, Dtype):\n",
-        "    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)\n",
-        "    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)\n",
-        "    return decoder_initial_state\n",
-        "\n",
-        "\n",
-        "  def call(self, inputs, initial_state):\n",
-        "    x = self.embedding(inputs)\n",
-        "    outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])\n",
-        "    return outputs\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "DaiO0Z6_Ml1c"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Decoder Outputs Shape:  (64, 10, 4936)\n"
-          ]
-        }
-      ],
-      "source": [
-        "# Test decoder stack\n",
-        "\n",
-        "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, 'luong')\n",
-        "sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))\n",
-        "decoder.attention_mechanism.setup_memory(sample_output)\n",
-        "initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)\n",
-        "\n",
-        "\n",
-        "sample_decoder_outputs = decoder(sample_x, initial_state)\n",
-        "\n",
-        "print(\"Decoder Outputs Shape: \", sample_decoder_outputs.rnn_output.shape)\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "_ch_71VbIRfK"
-      },
-      "source": [
-        "## Define the optimizer and the loss function"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "WmTHr5iV3jFr"
-      },
-      "outputs": [],
-      "source": [
-        "optimizer = tf.keras.optimizers.Adam()\n",
-        "\n",
-        "\n",
-        "def loss_function(real, pred):\n",
-        "  # real shape = (BATCH_SIZE, max_length_output)\n",
-        "  # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )\n",
-        "  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')\n",
-        "  loss = cross_entropy(y_true=real, y_pred=pred)\n",
-        "  mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1\n",
-        "  mask = tf.cast(mask, dtype=loss.dtype)  \n",
-        "  loss = mask* loss\n",
-        "  loss = tf.reduce_mean(loss)\n",
-        "  return loss  "
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "DMVWzzsfNl4e"
-      },
-      "source": [
-        "## Checkpoints (Object-based saving)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Zj8bXQTgNwrF"
-      },
-      "outputs": [],
-      "source": [
-        "checkpoint_dir = './training_checkpoints'\n",
-        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
-        "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
-        "                                 encoder=encoder,\n",
-        "                                 decoder=decoder)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "8Bw95utNiFHa"
-      },
-      "source": [
-        "## One train_step operations"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "sC9ArXSsVfqn"
-      },
-      "outputs": [],
-      "source": [
-        "@tf.function\n",
-        "def train_step(inp, targ, enc_hidden):\n",
-        "  loss = 0\n",
-        "\n",
-        "  with tf.GradientTape() as tape:\n",
-        "    enc_output, enc_h, enc_c = encoder(inp, enc_hidden)\n",
-        "\n",
-        "\n",
-        "    dec_input = targ[ : , :-1 ] # Ignore <end> token\n",
-        "    real = targ[ : , 1: ]         # ignore <start> token\n",
-        "\n",
-        "    # Set the AttentionMechanism object with encoder_outputs\n",
-        "    decoder.attention_mechanism.setup_memory(enc_output)\n",
-        "\n",
-        "    # Create AttentionWrapperState as initial_state for decoder\n",
-        "    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)\n",
-        "    pred = decoder(dec_input, decoder_initial_state)\n",
-        "    logits = pred.rnn_output\n",
-        "    loss = loss_function(real, logits)\n",
-        "\n",
-        "  variables = encoder.trainable_variables + decoder.trainable_variables\n",
-        "  gradients = tape.gradient(loss, variables)\n",
-        "  optimizer.apply_gradients(zip(gradients, variables))\n",
-        "\n",
-        "  return loss"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "pey8eb9piMMg"
-      },
-      "source": [
-        "## Train the model"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "ddefjBMa3jF0"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Epoch 1 Batch 0 Loss 5.1692\n",
-            "Epoch 1 Batch 100 Loss 2.2288\n",
-            "Epoch 1 Batch 200 Loss 1.9930\n",
-            "Epoch 1 Batch 300 Loss 1.7783\n",
-            "Epoch 1 Loss 1.6975\n",
-            "Time taken for 1 epoch 37.26002788543701 sec\n",
-            "\n",
-            "Epoch 2 Batch 0 Loss 1.6408\n",
-            "Epoch 2 Batch 100 Loss 1.5767\n",
-            "Epoch 2 Batch 200 Loss 1.4054\n",
-            "Epoch 2 Batch 300 Loss 1.3755\n",
-            "Epoch 2 Loss 1.1412\n",
-            "Time taken for 1 epoch 30.0094051361084 sec\n",
-            "\n",
-            "Epoch 3 Batch 0 Loss 1.0296\n",
-            "Epoch 3 Batch 100 Loss 1.0306\n",
-            "Epoch 3 Batch 200 Loss 1.0675\n",
-            "Epoch 3 Batch 300 Loss 0.9574\n",
-            "Epoch 3 Loss 0.8037\n",
-            "Time taken for 1 epoch 28.983767986297607 sec\n",
-            "\n",
-            "Epoch 4 Batch 0 Loss 0.5923\n",
-            "Epoch 4 Batch 100 Loss 0.7533\n",
-            "Epoch 4 Batch 200 Loss 0.7397\n",
-            "Epoch 4 Batch 300 Loss 0.6779\n",
-            "Epoch 4 Loss 0.5419\n",
-            "Time taken for 1 epoch 29.649972200393677 sec\n",
-            "\n",
-            "Epoch 5 Batch 0 Loss 0.4320\n",
-            "Epoch 5 Batch 100 Loss 0.4349\n",
-            "Epoch 5 Batch 200 Loss 0.4686\n",
-            "Epoch 5 Batch 300 Loss 0.4748\n",
-            "Epoch 5 Loss 0.3827\n",
-            "Time taken for 1 epoch 29.06334638595581 sec\n",
-            "\n",
-            "Epoch 6 Batch 0 Loss 0.3422\n",
-            "Epoch 6 Batch 100 Loss 0.3052\n",
-            "Epoch 6 Batch 200 Loss 0.3288\n",
-            "Epoch 6 Batch 300 Loss 0.3216\n",
-            "Epoch 6 Loss 0.2814\n",
-            "Time taken for 1 epoch 29.57170796394348 sec\n",
-            "\n",
-            "Epoch 7 Batch 0 Loss 0.2129\n",
-            "Epoch 7 Batch 100 Loss 0.2382\n",
-            "Epoch 7 Batch 200 Loss 0.2406\n",
-            "Epoch 7 Batch 300 Loss 0.2792\n",
-            "Epoch 7 Loss 0.2162\n",
-            "Time taken for 1 epoch 28.95500087738037 sec\n",
-            "\n",
-            "Epoch 8 Batch 0 Loss 0.2073\n",
-            "Epoch 8 Batch 100 Loss 0.2095\n",
-            "Epoch 8 Batch 200 Loss 0.1962\n",
-            "Epoch 8 Batch 300 Loss 0.1879\n",
-            "Epoch 8 Loss 0.1794\n",
-            "Time taken for 1 epoch 29.70877432823181 sec\n",
-            "\n",
-            "Epoch 9 Batch 0 Loss 0.1517\n",
-            "Epoch 9 Batch 100 Loss 0.2231\n",
-            "Epoch 9 Batch 200 Loss 0.2203\n",
-            "Epoch 9 Batch 300 Loss 0.2282\n",
-            "Epoch 9 Loss 0.1496\n",
-            "Time taken for 1 epoch 29.20821261405945 sec\n",
-            "\n",
-            "Epoch 10 Batch 0 Loss 0.1204\n",
-            "Epoch 10 Batch 100 Loss 0.1370\n",
-            "Epoch 10 Batch 200 Loss 0.1778\n",
-            "Epoch 10 Batch 300 Loss 0.2069\n",
-            "Epoch 10 Loss 0.1316\n",
-            "Time taken for 1 epoch 29.576894283294678 sec\n",
-            "\n"
-          ]
-        }
-      ],
-      "source": [
-        "EPOCHS = 10\n",
-        "\n",
-        "for epoch in range(EPOCHS):\n",
-        "  start = time.time()\n",
-        "\n",
-        "  enc_hidden = encoder.initialize_hidden_state()\n",
-        "  total_loss = 0\n",
-        "  # print(enc_hidden[0].shape, enc_hidden[1].shape)\n",
-        "\n",
-        "  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):\n",
-        "    batch_loss = train_step(inp, targ, enc_hidden)\n",
-        "    total_loss += batch_loss\n",
-        "\n",
-        "    if batch % 100 == 0:\n",
-        "      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,\n",
-        "                                                   batch,\n",
-        "                                                   batch_loss.numpy()))\n",
-        "  # saving (checkpoint) the model every 2 epochs\n",
-        "  if (epoch + 1) % 2 == 0:\n",
-        "    checkpoint.save(file_prefix = checkpoint_prefix)\n",
-        "\n",
-        "  print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n",
-        "                                      total_loss / steps_per_epoch))\n",
-        "  print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ]
-    },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "5aElYAKlV2Mi"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "wmYJlt6LWVOU"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "L-8q8rRRWcp6"
+   },
+   "source": [
+    "# TensorFlow Addons Networks : Sequence-to-Sequence NMT with Attention Mechanism\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/networks_seq2seq_nmt.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/networks_seq2seq_nmt.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "      <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/networks_seq2seq_nmt.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "9n0dcDw1Wszw"
+   },
+   "source": [
+    "## Overview\n",
+    "This notebook gives a brief introduction to the ***Sequence to Sequence Model Architecture***.\n",
+    "In this notebook you broadly cover four essential topics necessary for Neural Machine Translation:\n",
+    "\n",
+    "\n",
+    "* **Data cleaning**\n",
+    "* **Data preparation**\n",
+    "* **Neural Translation Model with Attention**\n",
+    "* **Final Translation with ```tfa.seq2seq.BasicDecoder``` and ```tfa.seq2seq.BeamSearchDecoder```** \n",
+    "\n",
+    "The basic idea behind such a model, though, is simply the encoder-decoder architecture. These networks are used for a variety of tasks such as text summarization, machine translation, image captioning, etc. This tutorial provides a hands-on understanding of the concept, explaining the technical jargon wherever necessary. You focus on the task of Neural Machine Translation (NMT), which was the very first testbed for seq2seq models.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MpySVYWJhxaV"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "_kxfdP4hJUPB"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "mU3Ce8M6I3rz"
-      },
-      "source": [
-        "## Use tf-addons BasicDecoder for decoding\n"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting tensorflow-addons==0.11.2\n",
+      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/b3/f8/d6fca180c123f2851035c4493690662ebdad0849a9059d56035434bff5c9/tensorflow_addons-0.11.2-cp36-cp36m-manylinux2010_x86_64.whl (1.1MB)\n",
+      "\u001b[K     |████████████████████████████████| 1.1MB 4.4MB/s \n",
+      "\u001b[?25hRequirement already satisfied: typeguard>=2.7 in /usr/local/lib/python3.6/dist-packages (from tensorflow-addons==0.11.2) (2.7.1)\n",
+      "Installing collected packages: tensorflow-addons\n",
+      "  Found existing installation: tensorflow-addons 0.11.0\n",
+      "    Uninstalling tensorflow-addons-0.11.0:\n",
+      "      Successfully uninstalled tensorflow-addons-0.11.0\n",
+      "Successfully installed tensorflow-addons-0.11.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install tensorflow-addons==0.11.2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "tnxXKDjq3jEL"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.ticker as ticker\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "import unicodedata\n",
+    "import re\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import io\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Ii_vg-XNXTil"
+   },
+   "source": [
+    "## Data Cleaning and Data Preparation \n",
+    "\n",
+    "You'll use a language dataset provided by http://www.manythings.org/anki/. This dataset contains language translation pairs in the format:\n",
+    "\n",
+    "---\n",
+    "      May I borrow this book?    ¿Puedo tomar prestado este libro?\n",
+    "---\n",
+    "\n",
+    "\n",
+    "There are a variety of languages available, but you'll use the English-Spanish dataset. After downloading the dataset, here are the steps you'll take to prepare the data:\n",
+    "\n",
+    "\n",
+    "1. Add a start and end token to each sentence.\n",
+    "2. Clean the sentences by removing special characters.\n",
+    "3. Create a Vocabulary with word index (mapping from word → id) and reverse word index (mapping from id → word).\n",
+    "4. Pad each sentence to a maximum length. (Why? You need to fix the maximum length for the inputs to recurrent encoders.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "PvRnGWnvXm6l"
+   },
+   "outputs": [],
+   "source": [
+    "def download_nmt():\n",
+    "    path_to_zip = tf.keras.utils.get_file(\n",
+    "        \"spa-eng.zip\",\n",
+    "        origin=\"http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\",\n",
+    "        extract=True,\n",
+    "    )\n",
+    "\n",
+    "    path_to_file = os.path.dirname(path_to_zip) + \"/spa-eng/spa.txt\"\n",
+    "    return path_to_file"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "NFKB2c_tX4wU"
+   },
+   "source": [
+    "### Define an NMTDataset class with the necessary functions to follow Step 1 to Step 4. \n",
+    "The ```call()``` will return:\n",
+    "1. ```train_dataset```  and ```val_dataset``` : ```tf.data.Dataset``` objects\n",
+    "2. ```inp_lang_tokenizer``` and ```targ_lang_tokenizer``` : ```tf.keras.preprocessing.text.Tokenizer``` objects "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "JMAHz7kJXc5N"
+   },
+   "outputs": [],
+   "source": [
+    "class NMTDataset:\n",
+    "    def __init__(self, problem_type=\"en-spa\"):\n",
+    "        self.problem_type = \"en-spa\"\n",
+    "        self.inp_lang_tokenizer = None\n",
+    "        self.targ_lang_tokenizer = None\n",
+    "\n",
+    "    def unicode_to_ascii(self, s):\n",
+    "        return \"\".join(\n",
+    "            c\n",
+    "            for c in unicodedata.normalize(\"NFD\", s)\n",
+    "            if unicodedata.category(c) != \"Mn\"\n",
+    "        )\n",
+    "\n",
+    "    ## Step 1 and Step 2\n",
+    "    def preprocess_sentence(self, w):\n",
+    "        w = self.unicode_to_ascii(w.lower().strip())\n",
+    "\n",
+    "        # creating a space between a word and the punctuation following it\n",
+    "        # eg: \"he is a boy.\" => \"he is a boy .\"\n",
+    "        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n",
+    "        w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n",
+    "        w = re.sub(r'[\" \"]+', \" \", w)\n",
+    "\n",
+    "        # replacing everything with space except (a-z, A-Z, \".\", \"?\", \"!\", \",\")\n",
+    "        w = re.sub(r\"[^a-zA-Z?.!,¿]+\", \" \", w)\n",
+    "\n",
+    "        w = w.strip()\n",
+    "\n",
+    "        # adding a start and an end token to the sentence\n",
+    "        # so that the model know when to start and stop predicting.\n",
+    "        w = \"<start> \" + w + \" <end>\"\n",
+    "        return w\n",
+    "\n",
+    "    def create_dataset(self, path, num_examples):\n",
+    "        # path : path to spa-eng.txt file\n",
+    "        # num_examples : Limit the total number of training example for faster training (set num_examples = len(lines) to use full data)\n",
+    "        lines = io.open(path, encoding=\"UTF-8\").read().strip().split(\"\\n\")\n",
+    "        word_pairs = [\n",
+    "            [self.preprocess_sentence(w) for w in l.split(\"\\t\")]\n",
+    "            for l in lines[:num_examples]\n",
+    "        ]\n",
+    "\n",
+    "        return zip(*word_pairs)\n",
+    "\n",
+    "    # Step 3 and Step 4\n",
+    "    def tokenize(self, lang):\n",
+    "        # lang = list of sentences in a language\n",
+    "\n",
+    "        # print(len(lang), \"example sentence: {}\".format(lang[0]))\n",
+    "        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(\n",
+    "            filters=\"\", oov_token=\"<OOV>\"\n",
+    "        )\n",
+    "        lang_tokenizer.fit_on_texts(lang)\n",
+    "\n",
+    "        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn)\n",
+    "        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)\n",
+    "        tensor = lang_tokenizer.texts_to_sequences(lang)\n",
+    "\n",
+    "        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences\n",
+    "        ## and pads the sequences to match the longest sequences in the given input\n",
+    "        tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding=\"post\")\n",
+    "\n",
+    "        return tensor, lang_tokenizer\n",
+    "\n",
+    "    def load_dataset(self, path, num_examples=None):\n",
+    "        # creating cleaned input, output pairs\n",
+    "        targ_lang, inp_lang = self.create_dataset(path, num_examples)\n",
+    "\n",
+    "        input_tensor, inp_lang_tokenizer = self.tokenize(inp_lang)\n",
+    "        target_tensor, targ_lang_tokenizer = self.tokenize(targ_lang)\n",
+    "\n",
+    "        return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer\n",
+    "\n",
+    "    def call(self, num_examples, BUFFER_SIZE, BATCH_SIZE):\n",
+    "        file_path = download_nmt()\n",
+    "        (\n",
+    "            input_tensor,\n",
+    "            target_tensor,\n",
+    "            self.inp_lang_tokenizer,\n",
+    "            self.targ_lang_tokenizer,\n",
+    "        ) = self.load_dataset(file_path, num_examples)\n",
+    "\n",
+    "        (\n",
+    "            input_tensor_train,\n",
+    "            input_tensor_val,\n",
+    "            target_tensor_train,\n",
+    "            target_tensor_val,\n",
+    "        ) = train_test_split(input_tensor, target_tensor, test_size=0.2)\n",
+    "\n",
+    "        train_dataset = tf.data.Dataset.from_tensor_slices(\n",
+    "            (input_tensor_train, target_tensor_train)\n",
+    "        )\n",
+    "        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(\n",
+    "            BATCH_SIZE, drop_remainder=True\n",
+    "        )\n",
+    "\n",
+    "        val_dataset = tf.data.Dataset.from_tensor_slices(\n",
+    "            (input_tensor_val, target_tensor_val)\n",
+    "        )\n",
+    "        val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)\n",
+    "\n",
+    "        return (\n",
+    "            train_dataset,\n",
+    "            val_dataset,\n",
+    "            self.inp_lang_tokenizer,\n",
+    "            self.targ_lang_tokenizer,\n",
+    "        )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "EIW4NVBmJ25k"
+   },
+   "outputs": [],
+   "source": [
+    "BUFFER_SIZE = 32000\n",
+    "BATCH_SIZE = 64\n",
+    "# Let's limit the #training examples for faster training\n",
+    "num_examples = 30000\n",
+    "\n",
+    "dataset_creator = NMTDataset(\"en-spa\")\n",
+    "train_dataset, val_dataset, inp_lang, targ_lang = dataset_creator.call(\n",
+    "    num_examples, BUFFER_SIZE, BATCH_SIZE\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "w2lCTy4vKOkB"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 98,
-      "metadata": {
-        "id": "EbQpyYs13jF_"
-      },
-      "outputs": [],
-      "source": [
-        "def evaluate_sentence(sentence):\n",
-        "  sentence = dataset_creator.preprocess_sentence(sentence)\n",
-        "\n",
-        "  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]\n",
-        "  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],\n",
-        "                                                          maxlen=max_length_input,\n",
-        "                                                          padding='post')\n",
-        "  inputs = tf.convert_to_tensor(inputs)\n",
-        "  inference_batch_size = inputs.shape[0]\n",
-        "  result = ''\n",
-        "\n",
-        "  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]\n",
-        "  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)\n",
-        "\n",
-        "  dec_h = enc_h\n",
-        "  dec_c = enc_c\n",
-        "\n",
-        "  start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])\n",
-        "  end_token = targ_lang.word_index['<end>']\n",
-        "\n",
-        "  greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()\n",
-        "\n",
-        "  # Instantiate BasicDecoder object\n",
-        "  decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)\n",
-        "  # Setup Memory in decoder stack\n",
-        "  decoder.attention_mechanism.setup_memory(enc_out)\n",
-        "\n",
-        "  # set decoder_initial_state\n",
-        "  decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)\n",
-        "\n",
-        "\n",
-        "  ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder \n",
-        "  ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. \n",
-        "  ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function\n",
-        "\n",
-        "  decoder_embedding_matrix = decoder.embedding.variables[0]\n",
-        "  \n",
-        "  outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)\n",
-        "  return outputs.sample_id.numpy()\n",
-        "\n",
-        "def translate(sentence):\n",
-        "  result = evaluate_sentence(sentence)\n",
-        "  print(result)\n",
-        "  result = targ_lang.sequences_to_texts(result)\n",
-        "  print('Input: %s' % (sentence))\n",
-        "  print('Predicted translation: {}'.format(result))"
+     "data": {
+      "text/plain": [
+       "(TensorShape([64, 16]), TensorShape([64, 11]))"
       ]
-    },
+     },
+     "execution_count": 7,
+     "metadata": {
+      "tags": []
+     },
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "example_input_batch, example_target_batch = next(iter(train_dataset))\n",
+    "example_input_batch.shape, example_target_batch.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rgCLkfv5uO3d"
+   },
+   "source": [
+    "### Some important parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "TqHsArVZ3jFS"
+   },
+   "outputs": [],
+   "source": [
+    "vocab_inp_size = len(inp_lang.word_index) + 1\n",
+    "vocab_tar_size = len(targ_lang.word_index) + 1\n",
+    "max_length_input = example_input_batch.shape[1]\n",
+    "max_length_output = example_target_batch.shape[1]\n",
+    "\n",
+    "embedding_dim = 256\n",
+    "units = 1024\n",
+    "steps_per_epoch = num_examples // BATCH_SIZE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "g-yY9c6aIu1h"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "n250XbnjOaqP"
-      },
-      "source": [
-        "## Restore the latest checkpoint and test"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "max_length_spanish, max_length_english, vocab_size_spanish, vocab_size_english\n"
+     ]
     },
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "UJpT9D5_OgP6"
-      },
-      "outputs": [
-        {
-          "data": {
-            "text/plain": [
-              "<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9499417390>"
-            ]
-          },
-          "execution_count": 20,
-          "metadata": {
-            "tags": []
-          },
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "# restoring the latest checkpoint in checkpoint_dir\n",
-        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
+     "data": {
+      "text/plain": [
+       "(16, 11, 9415, 4936)"
       ]
-    },
+     },
+     "execution_count": 9,
+     "metadata": {
+      "tags": []
+     },
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(\"max_length_english, max_length_spanish, vocab_size_english, vocab_size_spanish\")\n",
+    "max_length_input, max_length_output, vocab_inp_size, vocab_tar_size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "nZ2rI24i3jFg"
+   },
+   "outputs": [],
+   "source": [
+    "#####\n",
+    "\n",
+    "\n",
+    "class Encoder(tf.keras.Model):\n",
+    "    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n",
+    "        super(Encoder, self).__init__()\n",
+    "        self.batch_sz = batch_sz\n",
+    "        self.enc_units = enc_units\n",
+    "        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
+    "\n",
+    "        ##-------- LSTM layer in Encoder ------- ##\n",
+    "        self.lstm_layer = tf.keras.layers.LSTM(\n",
+    "            self.enc_units,\n",
+    "            return_sequences=True,\n",
+    "            return_state=True,\n",
+    "            recurrent_initializer=\"glorot_uniform\",\n",
+    "        )\n",
+    "\n",
+    "    def call(self, x, hidden):\n",
+    "        x = self.embedding(x)\n",
+    "        output, h, c = self.lstm_layer(x, initial_state=hidden)\n",
+    "        return output, h, c\n",
+    "\n",
+    "    def initialize_hidden_state(self):\n",
+    "        return [\n",
+    "            tf.zeros((self.batch_sz, self.enc_units)),\n",
+    "            tf.zeros((self.batch_sz, self.enc_units)),\n",
+    "        ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "60gSVh05Jl6l"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 99,
-      "metadata": {
-        "id": "WYmYhNN_faR5"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[[ 11  12  49 224  40   4   3]]\n",
-            "Input: hace mucho frio aqui.\n",
-            "Predicted translation: ['it s very pretty here . <end>']\n"
-          ]
-        }
-      ],
-      "source": [
-        "translate(u'hace mucho frio aqui.')"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)\n",
+      "Encoder h vecotr shape: (batch size, units) (64, 1024)\n",
+      "Encoder c vector shape: (batch size, units) (64, 1024)\n"
+     ]
+    }
+   ],
+   "source": [
+    "## Test Encoder Stack\n",
+    "\n",
+    "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n",
+    "\n",
+    "\n",
+    "# sample input\n",
+    "sample_hidden = encoder.initialize_hidden_state()\n",
+    "sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)\n",
+    "print(\n",
+    "    \"Encoder output shape: (batch size, sequence length, units) {}\".format(\n",
+    "        sample_output.shape\n",
+    "    )\n",
+    ")\n",
+    "print(\"Encoder h vecotr shape: (batch size, units) {}\".format(sample_h.shape))\n",
+    "print(\"Encoder c vector shape: (batch size, units) {}\".format(sample_c.shape))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "yJ_B3mhW3jFk"
+   },
+   "outputs": [],
+   "source": [
+    "class Decoder(tf.keras.Model):\n",
+    "    def __init__(\n",
+    "        self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type=\"luong\"\n",
+    "    ):\n",
+    "        super(Decoder, self).__init__()\n",
+    "        self.batch_sz = batch_sz\n",
+    "        self.dec_units = dec_units\n",
+    "        self.attention_type = attention_type\n",
+    "\n",
+    "        # Embedding Layer\n",
+    "        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
+    "\n",
+    "        # Final Dense layer on which softmax will be applied\n",
+    "        self.fc = tf.keras.layers.Dense(vocab_size)\n",
+    "\n",
+    "        # Define the fundamental cell for decoder recurrent structure\n",
+    "        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)\n",
+    "\n",
+    "        # Sampler\n",
+    "        self.sampler = tfa.seq2seq.sampler.TrainingSampler()\n",
+    "\n",
+    "        # Create attention mechanism with memory = None\n",
+    "        self.attention_mechanism = self.build_attention_mechanism(\n",
+    "            self.dec_units,\n",
+    "            None,\n",
+    "            self.batch_sz * [max_length_input],\n",
+    "            self.attention_type,\n",
+    "        )\n",
+    "\n",
+    "        # Wrap attention mechanism with the fundamental rnn cell of decoder\n",
+    "        self.rnn_cell = self.build_rnn_cell(batch_sz)\n",
+    "\n",
+    "        # Define the decoder with respect to fundamental rnn cell\n",
+    "        self.decoder = tfa.seq2seq.BasicDecoder(\n",
+    "            self.rnn_cell, sampler=self.sampler, output_layer=self.fc\n",
+    "        )\n",
+    "\n",
+    "    def build_rnn_cell(self, batch_sz):\n",
+    "        rnn_cell = tfa.seq2seq.AttentionWrapper(\n",
+    "            self.decoder_rnn_cell,\n",
+    "            self.attention_mechanism,\n",
+    "            attention_layer_size=self.dec_units,\n",
+    "        )\n",
+    "        return rnn_cell\n",
+    "\n",
+    "    def build_attention_mechanism(\n",
+    "        self, dec_units, memory, memory_sequence_length, attention_type=\"luong\"\n",
+    "    ):\n",
+    "        # ------------- #\n",
+    "        # attention_type: Which sort of attention (Bahdanau, Luong)\n",
+    "        # dec_units: final dimension of attention outputs\n",
+    "        # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)\n",
+    "        # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)\n",
+    "\n",
+    "        if attention_type == \"bahdanau\":\n",
+    "            return tfa.seq2seq.BahdanauAttention(\n",
+    "                units=dec_units,\n",
+    "                memory=memory,\n",
+    "                memory_sequence_length=memory_sequence_length,\n",
+    "            )\n",
+    "        else:\n",
+    "            return tfa.seq2seq.LuongAttention(\n",
+    "                units=dec_units,\n",
+    "                memory=memory,\n",
+    "                memory_sequence_length=memory_sequence_length,\n",
+    "            )\n",
+    "\n",
+    "    def build_initial_state(self, batch_sz, encoder_state, Dtype):\n",
+    "        decoder_initial_state = self.rnn_cell.get_initial_state(\n",
+    "            batch_size=batch_sz, dtype=Dtype\n",
+    "        )\n",
+    "        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)\n",
+    "        return decoder_initial_state\n",
+    "\n",
+    "    def call(self, inputs, initial_state):\n",
+    "        x = self.embedding(inputs)\n",
+    "        outputs, _, _ = self.decoder(\n",
+    "            x,\n",
+    "            initial_state=initial_state,\n",
+    "            sequence_length=self.batch_sz * [max_length_output - 1],\n",
+    "        )\n",
+    "        return outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DaiO0Z6_Ml1c"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 100,
-      "metadata": {
-        "id": "zSx2iM36EZQZ"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[[ 20   9  22 190   4   3]]\n",
-            "Input: esta es mi vida.\n",
-            "Predicted translation: ['this is my life . <end>']\n"
-          ]
-        }
-      ],
-      "source": [
-        "translate(u'esta es mi vida.')"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Decoder Outputs Shape:  (64, 10, 4936)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Test decoder stack\n",
+    "\n",
+    "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, \"luong\")\n",
+    "sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))\n",
+    "decoder.attention_mechanism.setup_memory(sample_output)\n",
+    "initial_state = decoder.build_initial_state(\n",
+    "    BATCH_SIZE, [sample_h, sample_c], tf.float32\n",
+    ")\n",
+    "\n",
+    "\n",
+    "sample_decoder_outputs = decoder(sample_x, initial_state)\n",
+    "\n",
+    "print(\"Decoder Outputs Shape: \", sample_decoder_outputs.rnn_output.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "_ch_71VbIRfK"
+   },
+   "source": [
+    "## Define the optimizer and the loss function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "WmTHr5iV3jFr"
+   },
+   "outputs": [],
+   "source": [
+    "optimizer = tf.keras.optimizers.Adam()\n",
+    "\n",
+    "\n",
+    "def loss_function(real, pred):\n",
+    "    # real shape = (BATCH_SIZE, max_length_output)\n",
+    "    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )\n",
+    "    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(\n",
+    "        from_logits=True, reduction=\"none\"\n",
+    "    )\n",
+    "    loss = cross_entropy(y_true=real, y_pred=pred)\n",
+    "    mask = tf.logical_not(tf.math.equal(real, 0))  # output 0 for y=0 else output 1\n",
+    "    mask = tf.cast(mask, dtype=loss.dtype)\n",
+    "    loss = mask * loss\n",
+    "    loss = tf.reduce_mean(loss)\n",
+    "    return loss"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DMVWzzsfNl4e"
+   },
+   "source": [
+    "## Checkpoints (Object-based saving)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Zj8bXQTgNwrF"
+   },
+   "outputs": [],
+   "source": [
+    "checkpoint_dir = \"./training_checkpoints\"\n",
+    "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
+    "checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8Bw95utNiFHa"
+   },
+   "source": [
+    "## One train_step operation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "sC9ArXSsVfqn"
+   },
+   "outputs": [],
+   "source": [
+    "@tf.function\n",
+    "def train_step(inp, targ, enc_hidden):\n",
+    "    loss = 0\n",
+    "\n",
+    "    with tf.GradientTape() as tape:\n",
+    "        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)\n",
+    "\n",
+    "        dec_input = targ[:, :-1]  # Ignore <end> token\n",
+    "        real = targ[:, 1:]  # ignore <start> token\n",
+    "\n",
+    "        # Set the AttentionMechanism object with encoder_outputs\n",
+    "        decoder.attention_mechanism.setup_memory(enc_output)\n",
+    "\n",
+    "        # Create AttentionWrapperState as initial_state for decoder\n",
+    "        decoder_initial_state = decoder.build_initial_state(\n",
+    "            BATCH_SIZE, [enc_h, enc_c], tf.float32\n",
+    "        )\n",
+    "        pred = decoder(dec_input, decoder_initial_state)\n",
+    "        logits = pred.rnn_output\n",
+    "        loss = loss_function(real, logits)\n",
+    "\n",
+    "    variables = encoder.trainable_variables + decoder.trainable_variables\n",
+    "    gradients = tape.gradient(loss, variables)\n",
+    "    optimizer.apply_gradients(zip(gradients, variables))\n",
+    "\n",
+    "    return loss"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pey8eb9piMMg"
+   },
+   "source": [
+    "## Train the model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ddefjBMa3jF0"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "A3LLCx3ZE0Ls"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[[25  7 90  8  3]]\n",
-            "Input: ¿todavia estan en casa?\n",
-            "Predicted translation: ['are you home ? <end>']\n"
-          ]
-        }
-      ],
-      "source": [
-        "translate(u'¿todavia estan en casa?')"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1 Batch 0 Loss 5.1692\n",
+      "Epoch 1 Batch 100 Loss 2.2288\n",
+      "Epoch 1 Batch 200 Loss 1.9930\n",
+      "Epoch 1 Batch 300 Loss 1.7783\n",
+      "Epoch 1 Loss 1.6975\n",
+      "Time taken for 1 epoch 37.26002788543701 sec\n",
+      "\n",
+      "Epoch 2 Batch 0 Loss 1.6408\n",
+      "Epoch 2 Batch 100 Loss 1.5767\n",
+      "Epoch 2 Batch 200 Loss 1.4054\n",
+      "Epoch 2 Batch 300 Loss 1.3755\n",
+      "Epoch 2 Loss 1.1412\n",
+      "Time taken for 1 epoch 30.0094051361084 sec\n",
+      "\n",
+      "Epoch 3 Batch 0 Loss 1.0296\n",
+      "Epoch 3 Batch 100 Loss 1.0306\n",
+      "Epoch 3 Batch 200 Loss 1.0675\n",
+      "Epoch 3 Batch 300 Loss 0.9574\n",
+      "Epoch 3 Loss 0.8037\n",
+      "Time taken for 1 epoch 28.983767986297607 sec\n",
+      "\n",
+      "Epoch 4 Batch 0 Loss 0.5923\n",
+      "Epoch 4 Batch 100 Loss 0.7533\n",
+      "Epoch 4 Batch 200 Loss 0.7397\n",
+      "Epoch 4 Batch 300 Loss 0.6779\n",
+      "Epoch 4 Loss 0.5419\n",
+      "Time taken for 1 epoch 29.649972200393677 sec\n",
+      "\n",
+      "Epoch 5 Batch 0 Loss 0.4320\n",
+      "Epoch 5 Batch 100 Loss 0.4349\n",
+      "Epoch 5 Batch 200 Loss 0.4686\n",
+      "Epoch 5 Batch 300 Loss 0.4748\n",
+      "Epoch 5 Loss 0.3827\n",
+      "Time taken for 1 epoch 29.06334638595581 sec\n",
+      "\n",
+      "Epoch 6 Batch 0 Loss 0.3422\n",
+      "Epoch 6 Batch 100 Loss 0.3052\n",
+      "Epoch 6 Batch 200 Loss 0.3288\n",
+      "Epoch 6 Batch 300 Loss 0.3216\n",
+      "Epoch 6 Loss 0.2814\n",
+      "Time taken for 1 epoch 29.57170796394348 sec\n",
+      "\n",
+      "Epoch 7 Batch 0 Loss 0.2129\n",
+      "Epoch 7 Batch 100 Loss 0.2382\n",
+      "Epoch 7 Batch 200 Loss 0.2406\n",
+      "Epoch 7 Batch 300 Loss 0.2792\n",
+      "Epoch 7 Loss 0.2162\n",
+      "Time taken for 1 epoch 28.95500087738037 sec\n",
+      "\n",
+      "Epoch 8 Batch 0 Loss 0.2073\n",
+      "Epoch 8 Batch 100 Loss 0.2095\n",
+      "Epoch 8 Batch 200 Loss 0.1962\n",
+      "Epoch 8 Batch 300 Loss 0.1879\n",
+      "Epoch 8 Loss 0.1794\n",
+      "Time taken for 1 epoch 29.70877432823181 sec\n",
+      "\n",
+      "Epoch 9 Batch 0 Loss 0.1517\n",
+      "Epoch 9 Batch 100 Loss 0.2231\n",
+      "Epoch 9 Batch 200 Loss 0.2203\n",
+      "Epoch 9 Batch 300 Loss 0.2282\n",
+      "Epoch 9 Loss 0.1496\n",
+      "Time taken for 1 epoch 29.20821261405945 sec\n",
+      "\n",
+      "Epoch 10 Batch 0 Loss 0.1204\n",
+      "Epoch 10 Batch 100 Loss 0.1370\n",
+      "Epoch 10 Batch 200 Loss 0.1778\n",
+      "Epoch 10 Batch 300 Loss 0.2069\n",
+      "Epoch 10 Loss 0.1316\n",
+      "Time taken for 1 epoch 29.576894283294678 sec\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "EPOCHS = 10\n",
+    "\n",
+    "for epoch in range(EPOCHS):\n",
+    "    start = time.time()\n",
+    "\n",
+    "    enc_hidden = encoder.initialize_hidden_state()\n",
+    "    total_loss = 0\n",
+    "    # print(enc_hidden[0].shape, enc_hidden[1].shape)\n",
+    "\n",
+    "    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):\n",
+    "        batch_loss = train_step(inp, targ, enc_hidden)\n",
+    "        total_loss += batch_loss\n",
+    "\n",
+    "        if batch % 100 == 0:\n",
+    "            print(\n",
+    "                \"Epoch {} Batch {} Loss {:.4f}\".format(\n",
+    "                    epoch + 1, batch, batch_loss.numpy()\n",
+    "                )\n",
+    "            )\n",
+    "    # saving (checkpoint) the model every 2 epochs\n",
+    "    if (epoch + 1) % 2 == 0:\n",
+    "        checkpoint.save(file_prefix=checkpoint_prefix)\n",
+    "\n",
+    "    print(\"Epoch {} Loss {:.4f}\".format(epoch + 1, total_loss / steps_per_epoch))\n",
+    "    print(\"Time taken for 1 epoch {} sec\\n\".format(time.time() - start))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mU3Ce8M6I3rz"
+   },
+   "source": [
+    "## Use tf-addons BasicDecoder for decoding\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {
+    "id": "EbQpyYs13jF_"
+   },
+   "outputs": [],
+   "source": [
+    "def evaluate_sentence(sentence):\n",
+    "    sentence = dataset_creator.preprocess_sentence(sentence)\n",
+    "\n",
+    "    inputs = [inp_lang.word_index[i] for i in sentence.split(\" \")]\n",
+    "    inputs = tf.keras.preprocessing.sequence.pad_sequences(\n",
+    "        [inputs], maxlen=max_length_input, padding=\"post\"\n",
+    "    )\n",
+    "    inputs = tf.convert_to_tensor(inputs)\n",
+    "    inference_batch_size = inputs.shape[0]\n",
+    "    result = \"\"\n",
+    "\n",
+    "    enc_start_state = [\n",
+    "        tf.zeros((inference_batch_size, units)),\n",
+    "        tf.zeros((inference_batch_size, units)),\n",
+    "    ]\n",
+    "    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)\n",
+    "\n",
+    "    dec_h = enc_h\n",
+    "    dec_c = enc_c\n",
+    "\n",
+    "    start_tokens = tf.fill([inference_batch_size], targ_lang.word_index[\"<start>\"])\n",
+    "    end_token = targ_lang.word_index[\"<end>\"]\n",
+    "\n",
+    "    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()\n",
+    "\n",
+    "    # Instantiate BasicDecoder object\n",
+    "    decoder_instance = tfa.seq2seq.BasicDecoder(\n",
+    "        cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc\n",
+    "    )\n",
+    "    # Setup Memory in decoder stack\n",
+    "    decoder.attention_mechanism.setup_memory(enc_out)\n",
+    "\n",
+    "    # set decoder_initial_state\n",
+    "    decoder_initial_state = decoder.build_initial_state(\n",
+    "        inference_batch_size, [enc_h, enc_c], tf.float32\n",
+    "    )\n",
+    "\n",
+    "    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder\n",
+    "    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this.\n",
+    "    ### You only need to get the weights of the embedding layer, which can be done by decoder.embedding.variables[0], and pass this matrix to BasicDecoder's call() function\n",
+    "\n",
+    "    decoder_embedding_matrix = decoder.embedding.variables[0]\n",
+    "\n",
+    "    outputs, _, _ = decoder_instance(\n",
+    "        decoder_embedding_matrix,\n",
+    "        start_tokens=start_tokens,\n",
+    "        end_token=end_token,\n",
+    "        initial_state=decoder_initial_state,\n",
+    "    )\n",
+    "    return outputs.sample_id.numpy()\n",
+    "\n",
+    "\n",
+    "def translate(sentence):\n",
+    "    result = evaluate_sentence(sentence)\n",
+    "    print(result)\n",
+    "    result = targ_lang.sequences_to_texts(result)\n",
+    "    print(\"Input: %s\" % (sentence))\n",
+    "    print(\"Predicted translation: {}\".format(result))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "n250XbnjOaqP"
+   },
+   "source": [
+    "## Restore the latest checkpoint and test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "UJpT9D5_OgP6"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "DUQVLVqUE1YW"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[[126  16 892  11  75   4   3]]\n",
-            "Input: trata de averiguarlo.\n",
-            "Predicted translation: ['try to figure it out . <end>']\n"
-          ]
-        }
-      ],
-      "source": [
-        "# wrong translation\n",
-        "translate(u'trata de averiguarlo.')"
+     "data": {
+      "text/plain": [
+       "<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9499417390>"
       ]
-    },
+     },
+     "execution_count": 20,
+     "metadata": {
+      "tags": []
+     },
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# restoring the latest checkpoint in checkpoint_dir\n",
+    "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {
+    "id": "WYmYhNN_faR5"
+   },
+   "outputs": [
     {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "IRUuNDeY0HiC"
-      },
-      "source": [
-        "## Use tf-addons BeamSearchDecoder \n"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[ 11  12  49 224  40   4   3]]\n",
+      "Input: hace mucho frio aqui.\n",
+      "Predicted translation: ['it s very pretty here . <end>']\n"
+     ]
+    }
+   ],
+   "source": [
+    "translate(\"hace mucho frio aqui.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {
+    "id": "zSx2iM36EZQZ"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 89,
-      "metadata": {
-        "id": "AJ-RTQ0hsJNL"
-      },
-      "outputs": [],
-      "source": [
-        "def beam_evaluate_sentence(sentence, beam_width=3):\n",
-        "  sentence = dataset_creator.preprocess_sentence(sentence)\n",
-        "\n",
-        "  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]\n",
-        "  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],\n",
-        "                                                          maxlen=max_length_input,\n",
-        "                                                          padding='post')\n",
-        "  inputs = tf.convert_to_tensor(inputs)\n",
-        "  inference_batch_size = inputs.shape[0]\n",
-        "  result = ''\n",
-        "\n",
-        "  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]\n",
-        "  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)\n",
-        "\n",
-        "  dec_h = enc_h\n",
-        "  dec_c = enc_c\n",
-        "\n",
-        "  start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])\n",
-        "  end_token = targ_lang.word_index['<end>']\n",
-        "\n",
-        "  # From official documentation\n",
-        "  # NOTE If you are using the BeamSearchDecoder with a cell wrapped in AttentionWrapper, then you must ensure that:\n",
-        "  # The encoder output has been tiled to beam_width via tfa.seq2seq.tile_batch (NOT tf.tile).\n",
-        "  # The batch_size argument passed to the get_initial_state method of this wrapper is equal to true_batch_size * beam_width.\n",
-        "  # The initial state created with get_initial_state above contains a cell_state value containing properly tiled final state from the encoder.\n",
-        "\n",
-        "  enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)\n",
-        "  decoder.attention_mechanism.setup_memory(enc_out)\n",
-        "  print(\"beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] :\", enc_out.shape)\n",
-        "\n",
-        "  # set decoder_inital_state which is an AttentionWrapperState considering beam_width\n",
-        "  hidden_state = tfa.seq2seq.tile_batch([enc_h, enc_c], multiplier=beam_width)\n",
-        "  decoder_initial_state = decoder.rnn_cell.get_initial_state(batch_size=beam_width*inference_batch_size, dtype=tf.float32)\n",
-        "  decoder_initial_state = decoder_initial_state.clone(cell_state=hidden_state)\n",
-        "\n",
-        "  # Instantiate BeamSearchDecoder\n",
-        "  decoder_instance = tfa.seq2seq.BeamSearchDecoder(decoder.rnn_cell,beam_width=beam_width, output_layer=decoder.fc)\n",
-        "  decoder_embedding_matrix = decoder.embedding.variables[0]\n",
-        "\n",
-        "  # The BeamSearchDecoder object's call() function takes care of everything.\n",
-        "  outputs, final_state, sequence_lengths = decoder_instance(decoder_embedding_matrix, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)\n",
-        "  # outputs is tfa.seq2seq.FinalBeamSearchDecoderOutput object. \n",
-        "  # The final beam predictions are stored in outputs.predicted_id\n",
-        "  # outputs.beam_search_decoder_output is a tfa.seq2seq.BeamSearchDecoderOutput object which keep tracks of beam_scores and parent_ids while performing a beam decoding step\n",
-        "  # final_state = tfa.seq2seq.BeamSearchDecoderState object.\n",
-        "  # Sequence Length = [inference_batch_size, beam_width] details the maximum length of the beams that are generated\n",
-        "\n",
-        "  \n",
-        "  # outputs.predicted_id.shape = (inference_batch_size, time_step_outputs, beam_width)\n",
-        "  # outputs.beam_search_decoder_output.scores.shape = (inference_batch_size, time_step_outputs, beam_width)\n",
-        "  # Convert the shape of outputs and beam_scores to (inference_batch_size, beam_width, time_step_outputs)\n",
-        "  final_outputs = tf.transpose(outputs.predicted_ids, perm=(0,2,1))\n",
-        "  beam_scores = tf.transpose(outputs.beam_search_decoder_output.scores, perm=(0,2,1))\n",
-        "  \n",
-        "  return final_outputs.numpy(), beam_scores.numpy()"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[ 20   9  22 190   4   3]]\n",
+      "Input: esta es mi vida.\n",
+      "Predicted translation: ['this is my life . <end>']\n"
+     ]
+    }
+   ],
+   "source": [
+    "translate(\"esta es mi vida.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "A3LLCx3ZE0Ls"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 90,
-      "metadata": {
-        "id": "g_LvXGvX8X-O"
-      },
-      "outputs": [],
-      "source": [
-        "def beam_translate(sentence):\n",
-        "  result, beam_scores = beam_evaluate_sentence(sentence)\n",
-        "  print(result.shape, beam_scores.shape)\n",
-        "  for beam, score in zip(result, beam_scores):\n",
-        "    print(beam.shape, score.shape)\n",
-        "    output = targ_lang.sequences_to_texts(beam)\n",
-        "    output = [a[:a.index('<end>')] for a in output]\n",
-        "    beam_score = [a.sum() for a in score]\n",
-        "    print('Input: %s' % (sentence))\n",
-        "    for i in range(len(output)):\n",
-        "      print('{} Predicted translation: {}  {}'.format(i+1, output[i], beam_score[i]))\n"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[25  7 90  8  3]]\n",
+      "Input: ¿todavia estan en casa?\n",
+      "Predicted translation: ['are you home ? <end>']\n"
+     ]
+    }
+   ],
+   "source": [
+    "translate(\"¿todavia estan en casa?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DUQVLVqUE1YW"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 91,
-      "metadata": {
-        "id": "TODnXBleDzzO"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] : (3, 16, 1024)\n",
-            "(1, 3, 7) (1, 3, 7)\n",
-            "(3, 7) (3, 7)\n",
-            "Input: hace mucho frio aqui.\n",
-            "1 Predicted translation: it s very pretty here .   -4.117094039916992\n",
-            "2 Predicted translation: it s very cold here .   -14.85302734375\n",
-            "3 Predicted translation: it s very pretty news .   -25.59416389465332\n"
-          ]
-        }
-      ],
-      "source": [
-        "beam_translate(u'hace mucho frio aqui.')"
-      ]
-    },
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[126  16 892  11  75   4   3]]\n",
+      "Input: trata de averiguarlo.\n",
+      "Predicted translation: ['try to figure it out . <end>']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# wrong translation\n",
+    "translate(\"trata de averiguarlo.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "IRUuNDeY0HiC"
+   },
+   "source": [
+    "## Use tf-addons BeamSearchDecoder \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {
+    "id": "AJ-RTQ0hsJNL"
+   },
+   "outputs": [],
+   "source": [
+    "def beam_evaluate_sentence(sentence, beam_width=3):\n",
+    "    sentence = dataset_creator.preprocess_sentence(sentence)\n",
+    "\n",
+    "    inputs = [inp_lang.word_index[i] for i in sentence.split(\" \")]\n",
+    "    inputs = tf.keras.preprocessing.sequence.pad_sequences(\n",
+    "        [inputs], maxlen=max_length_input, padding=\"post\"\n",
+    "    )\n",
+    "    inputs = tf.convert_to_tensor(inputs)\n",
+    "    inference_batch_size = inputs.shape[0]\n",
+    "    result = \"\"\n",
+    "\n",
+    "    enc_start_state = [\n",
+    "        tf.zeros((inference_batch_size, units)),\n",
+    "        tf.zeros((inference_batch_size, units)),\n",
+    "    ]\n",
+    "    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)\n",
+    "\n",
+    "    dec_h = enc_h\n",
+    "    dec_c = enc_c\n",
+    "\n",
+    "    start_tokens = tf.fill([inference_batch_size], targ_lang.word_index[\"<start>\"])\n",
+    "    end_token = targ_lang.word_index[\"<end>\"]\n",
+    "\n",
+    "    # From official documentation\n",
+    "    # NOTE If you are using the BeamSearchDecoder with a cell wrapped in AttentionWrapper, then you must ensure that:\n",
+    "    # The encoder output has been tiled to beam_width via tfa.seq2seq.tile_batch (NOT tf.tile).\n",
+    "    # The batch_size argument passed to the get_initial_state method of this wrapper is equal to true_batch_size * beam_width.\n",
+    "    # The initial state created with get_initial_state above contains a cell_state value containing properly tiled final state from the encoder.\n",
+    "\n",
+    "    enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)\n",
+    "    decoder.attention_mechanism.setup_memory(enc_out)\n",
+    "    print(\n",
+    "        \"beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] :\",\n",
+    "        enc_out.shape,\n",
+    "    )\n",
+    "\n",
+    "    # set decoder_initial_state which is an AttentionWrapperState considering beam_width\n",
+    "    hidden_state = tfa.seq2seq.tile_batch([enc_h, enc_c], multiplier=beam_width)\n",
+    "    decoder_initial_state = decoder.rnn_cell.get_initial_state(\n",
+    "        batch_size=beam_width * inference_batch_size, dtype=tf.float32\n",
+    "    )\n",
+    "    decoder_initial_state = decoder_initial_state.clone(cell_state=hidden_state)\n",
+    "\n",
+    "    # Instantiate BeamSearchDecoder\n",
+    "    decoder_instance = tfa.seq2seq.BeamSearchDecoder(\n",
+    "        decoder.rnn_cell, beam_width=beam_width, output_layer=decoder.fc\n",
+    "    )\n",
+    "    decoder_embedding_matrix = decoder.embedding.variables[0]\n",
+    "\n",
+    "    # The BeamSearchDecoder object's call() function takes care of everything.\n",
+    "    outputs, final_state, sequence_lengths = decoder_instance(\n",
+    "        decoder_embedding_matrix,\n",
+    "        start_tokens=start_tokens,\n",
+    "        end_token=end_token,\n",
+    "        initial_state=decoder_initial_state,\n",
+    "    )\n",
+    "    # outputs is tfa.seq2seq.FinalBeamSearchDecoderOutput object.\n",
+    "    # The final beam predictions are stored in outputs.predicted_ids\n",
+    "    # outputs.beam_search_decoder_output is a tfa.seq2seq.BeamSearchDecoderOutput object which keeps track of beam_scores and parent_ids while performing a beam decoding step\n",
+    "    # final_state = tfa.seq2seq.BeamSearchDecoderState object.\n",
+    "    # Sequence Length = [inference_batch_size, beam_width] details the maximum length of the beams that are generated\n",
+    "\n",
+    "    # outputs.predicted_ids.shape = (inference_batch_size, time_step_outputs, beam_width)\n",
+    "    # outputs.beam_search_decoder_output.scores.shape = (inference_batch_size, time_step_outputs, beam_width)\n",
+    "    # Convert the shape of outputs and beam_scores to (inference_batch_size, beam_width, time_step_outputs)\n",
+    "    final_outputs = tf.transpose(outputs.predicted_ids, perm=(0, 2, 1))\n",
+    "    beam_scores = tf.transpose(\n",
+    "        outputs.beam_search_decoder_output.scores, perm=(0, 2, 1)\n",
+    "    )\n",
+    "\n",
+    "    return final_outputs.numpy(), beam_scores.numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {
+    "id": "g_LvXGvX8X-O"
+   },
+   "outputs": [],
+   "source": [
+    "def beam_translate(sentence):\n",
+    "    result, beam_scores = beam_evaluate_sentence(sentence)\n",
+    "    print(result.shape, beam_scores.shape)\n",
+    "    for beam, score in zip(result, beam_scores):\n",
+    "        print(beam.shape, score.shape)\n",
+    "        output = targ_lang.sequences_to_texts(beam)\n",
+    "        output = [a[: a.index(\"<end>\")] for a in output]\n",
+    "        beam_score = [a.sum() for a in score]\n",
+    "        print(\"Input: %s\" % (sentence))\n",
+    "        for i in range(len(output)):\n",
+    "            print(\n",
+    "                \"{} Predicted translation: {}  {}\".format(\n",
+    "                    i + 1, output[i], beam_score[i]\n",
+    "                )\n",
+    "            )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {
+    "id": "TODnXBleDzzO"
+   },
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "_BezQwENFY3L"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] : (3, 16, 1024)\n",
-            "(1, 3, 7) (1, 3, 7)\n",
-            "(3, 7) (3, 7)\n",
-            "Input: ¿todavia estan en casa?\n",
-            "1 Predicted translation: are you still home ?   -4.036754131317139\n",
-            "2 Predicted translation: are you still at home ?   -15.306867599487305\n",
-            "3 Predicted translation: are you still go home ?   -20.533388137817383\n"
-          ]
-        }
-      ],
-      "source": [
-        "beam_translate(u'¿todavia estan en casa?')"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] : (3, 16, 1024)\n",
+      "(1, 3, 7) (1, 3, 7)\n",
+      "(3, 7) (3, 7)\n",
+      "Input: hace mucho frio aqui.\n",
+      "1 Predicted translation: it s very pretty here .   -4.117094039916992\n",
+      "2 Predicted translation: it s very cold here .   -14.85302734375\n",
+      "3 Predicted translation: it s very pretty news .   -25.59416389465332\n"
+     ]
     }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "networks_seq2seq_nmt.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
+   ],
+   "source": [
+    "beam_translate(\"hace mucho frio aqui.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "_BezQwENFY3L"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "beam_with * [batch_size, max_length_input, rnn_units] :  3 * [1, 16, 1024]] : (3, 16, 1024)\n",
+      "(1, 3, 7) (1, 3, 7)\n",
+      "(3, 7) (3, 7)\n",
+      "Input: ¿todavia estan en casa?\n",
+      "1 Predicted translation: are you still home ?   -4.036754131317139\n",
+      "2 Predicted translation: are you still at home ?   -15.306867599487305\n",
+      "3 Predicted translation: are you still go home ?   -20.533388137817383\n"
+     ]
     }
+   ],
+   "source": [
+    "beam_translate(\"¿todavia estan en casa?\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "networks_seq2seq_nmt.ipynb",
+   "toc_visible": true
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/optimizers_conditionalgradient.ipynb b/docs/tutorials/optimizers_conditionalgradient.ipynb
index 8ac4c1f6d8..f1d246cd0a 100644
--- a/docs/tutorials/optimizers_conditionalgradient.ipynb
+++ b/docs/tutorials/optimizers_conditionalgradient.ipynb
@@ -1,399 +1,410 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "pGUYKbJNWNgj"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "1PzPJglSWgnW"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "b5P4BEg1XYd5"
-      },
-      "source": [
-        "# TensorFlow Addons Optimizers: ConditionalGradient\n",
-        "\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/optimizers_conditionalgradient\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/optimizers_conditionalgradient.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/optimizers_conditionalgradient.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/optimizers_conditionalgradient.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Faj8luWnYNSG"
-      },
-      "source": [
-        "# Overview\n",
-        "This notebook will demonstrate how to use the Conditional Graident Optimizer from the Addons package."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MrDjqjY6YRYM"
-      },
-      "source": [
-        "# ConditionalGradient\n",
-        "\n",
-        "\n",
-        "> Constraining the parameters of a neural network has been shown to be beneficial in training because of the underlying regularization effects.  Often, parameters are constrained via a soft penalty (which never guarantees the constraint satisfaction) or via a projection operation (which is computationally expensive). Conditional gradient (CG) optimizer, on the other hand, enforces the constraints strictly without the need for an expensive projection step. It works by minimizing a linear approximation of the objective within the constraint set. In this notebook, you demonstrate the appliction of Frobenius norm constraint via the CG optimizer on the MNIST dataset. CG is now available as a tensorflow API. More details of the optimizer are available at https://arxiv.org/pdf/1803.06453.pdf\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "dooBaYGLYYnn"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "2sCyoNXlgGbk"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "qYo0FkL4O7io"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa\n",
-        "from matplotlib import pyplot as plt"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "kR0PnjrIirpJ"
-      },
-      "outputs": [],
-      "source": [
-        "# Hyperparameters\n",
-        "batch_size=64\n",
-        "epochs=10"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "-x0WBp-IYz7x"
-      },
-      "source": [
-        "# Build the Model"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "4KzMDUT0i1QE"
-      },
-      "outputs": [],
-      "source": [
-        "model_1 = tf.keras.Sequential([\n",
-        "    tf.keras.layers.Dense(64, input_shape=(784,), activation='relu', name='dense_1'),\n",
-        "    tf.keras.layers.Dense(64, activation='relu', name='dense_2'),\n",
-        "    tf.keras.layers.Dense(10, activation='softmax', name='predictions'),\n",
-        "])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "XGADNG3-Y7aa"
-      },
-      "source": [
-        "# Prep the Data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "d6a-kbM_i1b2"
-      },
-      "outputs": [],
-      "source": [
-        "# Load MNIST dataset as NumPy arrays\n",
-        "dataset = {}\n",
-        "num_validation = 10000\n",
-        "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
-        "\n",
-        "# Preprocess the data\n",
-        "x_train = x_train.reshape(-1, 784).astype('float32') / 255\n",
-        "x_test = x_test.reshape(-1, 784).astype('float32') / 255"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "sOlB-WqjZp1Y"
-      },
-      "source": [
-        "# Define a Custom Callback Function"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "8LCmRXUgZqyV"
-      },
-      "outputs": [],
-      "source": [
-        "def frobenius_norm(m):\n",
-        "    \"\"\"This function is to calculate the frobenius norm of the matrix of all\n",
-        "    layer's weight.\n",
-        "  \n",
-        "    Args:\n",
-        "        m: is a list of weights param for each layers.\n",
-        "    \"\"\"\n",
-        "    total_reduce_sum = 0\n",
-        "    for i in range(len(m)):\n",
-        "        total_reduce_sum = total_reduce_sum + tf.math.reduce_sum(m[i]**2)\n",
-        "    norm = total_reduce_sum**0.5\n",
-        "    return norm"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "udSvzKm4Z5Zr"
-      },
-      "outputs": [],
-      "source": [
-        "CG_frobenius_norm_of_weight = []\n",
-        "CG_get_weight_norm = tf.keras.callbacks.LambdaCallback(\n",
-        "    on_epoch_end=lambda batch, logs: CG_frobenius_norm_of_weight.append(\n",
-        "        frobenius_norm(model_1.trainable_weights).numpy()))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "qfhE1DfwZC1i"
-      },
-      "source": [
-        "# Train and Evaluate: Using CG as Optimizer\n",
-        "\n",
-        "Simply replace typical keras optimizers with the new tfa optimizer "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "6-AMaOYEi1kK"
-      },
-      "outputs": [],
-      "source": [
-        "# Compile the model\n",
-        "model_1.compile(\n",
-        "    optimizer=tfa.optimizers.ConditionalGradient(\n",
-        "        learning_rate=0.99949, lambda_=203),  # Utilize TFA optimizer\n",
-        "    loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
-        "    metrics=['accuracy'])\n",
-        "\n",
-        "history_cg = model_1.fit(\n",
-        "    x_train,\n",
-        "    y_train,\n",
-        "    batch_size=batch_size,\n",
-        "    validation_data=(x_test, y_test),\n",
-        "    epochs=epochs,\n",
-        "    callbacks=[CG_get_weight_norm])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "8OJp4So9bYYR"
-      },
-      "source": [
-        "# Train and Evaluate: Using SGD as Optimizer"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "SuizUueqn449"
-      },
-      "outputs": [],
-      "source": [
-        "model_2 = tf.keras.Sequential([\n",
-        "    tf.keras.layers.Dense(64, input_shape=(784,), activation='relu', name='dense_1'),\n",
-        "    tf.keras.layers.Dense(64, activation='relu', name='dense_2'),\n",
-        "    tf.keras.layers.Dense(10, activation='softmax', name='predictions'),\n",
-        "])"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "V8QC3xCwbfNl"
-      },
-      "outputs": [],
-      "source": [
-        "SGD_frobenius_norm_of_weight = []\n",
-        "SGD_get_weight_norm = tf.keras.callbacks.LambdaCallback(\n",
-        "    on_epoch_end=lambda batch, logs: SGD_frobenius_norm_of_weight.append(\n",
-        "        frobenius_norm(model_2.trainable_weights).numpy()))"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "9BNi4yXGcDlg"
-      },
-      "outputs": [],
-      "source": [
-        "# Compile the model\n",
-        "model_2.compile(\n",
-        "    optimizer=tf.keras.optimizers.SGD(0.01),  # Utilize SGD optimizer\n",
-        "    loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
-        "    metrics=['accuracy'])\n",
-        "\n",
-        "history_sgd = model_2.fit(\n",
-        "    x_train,\n",
-        "    y_train,\n",
-        "    batch_size=batch_size,\n",
-        "    validation_data=(x_test, y_test),\n",
-        "    epochs=epochs,\n",
-        "    callbacks=[SGD_get_weight_norm])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "1Myw0FVcd_Z9"
-      },
-      "source": [
-        "# Frobenius Norm of Weights: CG vs SGD"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "0tJYQBRt-ZUl"
-      },
-      "source": [
-        "The current implementation of CG optimizer is based on Frobenius Norm, with considering Frobenius Norm as regularizer in the target function. Therefore, you compare CG’s regularized effect with SGD optimizer, which has not imposed Frobenius Norm regularizer."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Ewf17MW1cJVI"
-      },
-      "outputs": [],
-      "source": [
-        "plt.plot(\n",
-        "    CG_frobenius_norm_of_weight,\n",
-        "    color='r',\n",
-        "    label='CG_frobenius_norm_of_weights')\n",
-        "plt.plot(\n",
-        "    SGD_frobenius_norm_of_weight,\n",
-        "    color='b',\n",
-        "    label='SGD_frobenius_norm_of_weights')\n",
-        "plt.xlabel('Epoch')\n",
-        "plt.ylabel('Frobenius norm of weights')\n",
-        "plt.legend(loc=1)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "JGtutiXuoZyx"
-      },
-      "source": [
-        "# Train and Validation Accuracy: CG vs SGD\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "s-SNIr10o2va"
-      },
-      "outputs": [],
-      "source": [
-        "plt.plot(history_cg.history['accuracy'], color='r', label='CG_train')\n",
-        "plt.plot(history_cg.history['val_accuracy'], color='g', label='CG_test')\n",
-        "plt.plot(history_sgd.history['accuracy'], color='pink', label='SGD_train')\n",
-        "plt.plot(history_sgd.history['val_accuracy'], color='b', label='SGD_test')\n",
-        "plt.xlabel('Epoch')\n",
-        "plt.ylabel('Accuracy')\n",
-        "plt.legend(loc=4)"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "optimizers_conditionalgradient.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pGUYKbJNWNgj"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors."
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "1PzPJglSWgnW"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "b5P4BEg1XYd5"
+   },
+   "source": [
+    "# TensorFlow Addons Optimizers: ConditionalGradient\n",
+    "\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/optimizers_conditionalgradient\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/optimizers_conditionalgradient.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/optimizers_conditionalgradient.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/optimizers_conditionalgradient.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Faj8luWnYNSG"
+   },
+   "source": [
+    "# Overview\n",
+    "This notebook will demonstrate how to use the Conditional Gradient Optimizer from the Addons package."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MrDjqjY6YRYM"
+   },
+   "source": [
+    "# ConditionalGradient\n",
+    "\n",
+    "\n",
+    "> Constraining the parameters of a neural network has been shown to be beneficial in training because of the underlying regularization effects. Often, parameters are constrained via a soft penalty (which never guarantees the constraint satisfaction) or via a projection operation (which is computationally expensive). The conditional gradient (CG) optimizer, on the other hand, enforces the constraints strictly without the need for an expensive projection step. It works by minimizing a linear approximation of the objective within the constraint set. In this notebook, you demonstrate the application of a Frobenius norm constraint via the CG optimizer on the MNIST dataset. CG is now available as a TensorFlow API. More details of the optimizer are available at https://arxiv.org/pdf/1803.06453.pdf\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "dooBaYGLYYnn"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2sCyoNXlgGbk"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "qYo0FkL4O7io"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa\n",
+    "from matplotlib import pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "kR0PnjrIirpJ"
+   },
+   "outputs": [],
+   "source": [
+    "# Hyperparameters\n",
+    "batch_size = 64\n",
+    "epochs = 10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-x0WBp-IYz7x"
+   },
+   "source": [
+    "# Build the Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "4KzMDUT0i1QE"
+   },
+   "outputs": [],
+   "source": [
+    "model_1 = tf.keras.Sequential(\n",
+    "    [\n",
+    "        tf.keras.layers.Dense(\n",
+    "            64, input_shape=(784,), activation=\"relu\", name=\"dense_1\"\n",
+    "        ),\n",
+    "        tf.keras.layers.Dense(64, activation=\"relu\", name=\"dense_2\"),\n",
+    "        tf.keras.layers.Dense(10, activation=\"softmax\", name=\"predictions\"),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "XGADNG3-Y7aa"
+   },
+   "source": [
+    "# Prep the Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "d6a-kbM_i1b2"
+   },
+   "outputs": [],
+   "source": [
+    "# Load MNIST dataset as NumPy arrays\n",
+    "dataset = {}\n",
+    "num_validation = 10000\n",
+    "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
+    "\n",
+    "# Preprocess the data\n",
+    "x_train = x_train.reshape(-1, 784).astype(\"float32\") / 255\n",
+    "x_test = x_test.reshape(-1, 784).astype(\"float32\") / 255"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "sOlB-WqjZp1Y"
+   },
+   "source": [
+    "# Define a Custom Callback Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "8LCmRXUgZqyV"
+   },
+   "outputs": [],
+   "source": [
+    "def frobenius_norm(m):\n",
+    "    \"\"\"This function is to calculate the frobenius norm of the matrix of all\n",
+    "    layer's weight.\n",
+    "\n",
+    "    Args:\n",
+    "        m: is a list of weights param for each layers.\n",
+    "    \"\"\"\n",
+    "    total_reduce_sum = 0\n",
+    "    for i in range(len(m)):\n",
+    "        total_reduce_sum = total_reduce_sum + tf.math.reduce_sum(m[i] ** 2)\n",
+    "    norm = total_reduce_sum ** 0.5\n",
+    "    return norm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "udSvzKm4Z5Zr"
+   },
+   "outputs": [],
+   "source": [
+    "CG_frobenius_norm_of_weight = []\n",
+    "CG_get_weight_norm = tf.keras.callbacks.LambdaCallback(\n",
+    "    on_epoch_end=lambda batch, logs: CG_frobenius_norm_of_weight.append(\n",
+    "        frobenius_norm(model_1.trainable_weights).numpy()\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "qfhE1DfwZC1i"
+   },
+   "source": [
+    "# Train and Evaluate: Using CG as Optimizer\n",
+    "\n",
+    "Simply replace the typical Keras optimizer with the new TFA optimizer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6-AMaOYEi1kK"
+   },
+   "outputs": [],
+   "source": [
+    "# Compile the model\n",
+    "model_1.compile(\n",
+    "    optimizer=tfa.optimizers.ConditionalGradient(\n",
+    "        learning_rate=0.99949, lambda_=203\n",
+    "    ),  # Utilize TFA optimizer\n",
+    "    loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
+    "    metrics=[\"accuracy\"],\n",
+    ")\n",
+    "\n",
+    "history_cg = model_1.fit(\n",
+    "    x_train,\n",
+    "    y_train,\n",
+    "    batch_size=batch_size,\n",
+    "    validation_data=(x_test, y_test),\n",
+    "    epochs=epochs,\n",
+    "    callbacks=[CG_get_weight_norm],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8OJp4So9bYYR"
+   },
+   "source": [
+    "# Train and Evaluate: Using SGD as Optimizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SuizUueqn449"
+   },
+   "outputs": [],
+   "source": [
+    "model_2 = tf.keras.Sequential(\n",
+    "    [\n",
+    "        tf.keras.layers.Dense(\n",
+    "            64, input_shape=(784,), activation=\"relu\", name=\"dense_1\"\n",
+    "        ),\n",
+    "        tf.keras.layers.Dense(64, activation=\"relu\", name=\"dense_2\"),\n",
+    "        tf.keras.layers.Dense(10, activation=\"softmax\", name=\"predictions\"),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "V8QC3xCwbfNl"
+   },
+   "outputs": [],
+   "source": [
+    "SGD_frobenius_norm_of_weight = []\n",
+    "SGD_get_weight_norm = tf.keras.callbacks.LambdaCallback(\n",
+    "    on_epoch_end=lambda batch, logs: SGD_frobenius_norm_of_weight.append(\n",
+    "        frobenius_norm(model_2.trainable_weights).numpy()\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "9BNi4yXGcDlg"
+   },
+   "outputs": [],
+   "source": [
+    "# Compile the model\n",
+    "model_2.compile(\n",
+    "    optimizer=tf.keras.optimizers.SGD(0.01),  # Utilize SGD optimizer\n",
+    "    loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
+    "    metrics=[\"accuracy\"],\n",
+    ")\n",
+    "\n",
+    "history_sgd = model_2.fit(\n",
+    "    x_train,\n",
+    "    y_train,\n",
+    "    batch_size=batch_size,\n",
+    "    validation_data=(x_test, y_test),\n",
+    "    epochs=epochs,\n",
+    "    callbacks=[SGD_get_weight_norm],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1Myw0FVcd_Z9"
+   },
+   "source": [
+    "# Frobenius Norm of Weights: CG vs SGD"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0tJYQBRt-ZUl"
+   },
+   "source": [
+    "The current implementation of the CG optimizer is based on the Frobenius norm, treating the Frobenius norm as a regularizer in the target function. Therefore, you compare CG’s regularization effect with that of the SGD optimizer, which does not impose a Frobenius norm regularizer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Ewf17MW1cJVI"
+   },
+   "outputs": [],
+   "source": [
+    "plt.plot(CG_frobenius_norm_of_weight, color=\"r\", label=\"CG_frobenius_norm_of_weights\")\n",
+    "plt.plot(SGD_frobenius_norm_of_weight, color=\"b\", label=\"SGD_frobenius_norm_of_weights\")\n",
+    "plt.xlabel(\"Epoch\")\n",
+    "plt.ylabel(\"Frobenius norm of weights\")\n",
+    "plt.legend(loc=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JGtutiXuoZyx"
+   },
+   "source": [
+    "# Train and Validation Accuracy: CG vs SGD\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "s-SNIr10o2va"
+   },
+   "outputs": [],
+   "source": [
+    "plt.plot(history_cg.history[\"accuracy\"], color=\"r\", label=\"CG_train\")\n",
+    "plt.plot(history_cg.history[\"val_accuracy\"], color=\"g\", label=\"CG_test\")\n",
+    "plt.plot(history_sgd.history[\"accuracy\"], color=\"pink\", label=\"SGD_train\")\n",
+    "plt.plot(history_sgd.history[\"val_accuracy\"], color=\"b\", label=\"SGD_test\")\n",
+    "plt.xlabel(\"Epoch\")\n",
+    "plt.ylabel(\"Accuracy\")\n",
+    "plt.legend(loc=4)"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "optimizers_conditionalgradient.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/optimizers_cyclicallearningrate.ipynb b/docs/tutorials/optimizers_cyclicallearningrate.ipynb
index 8e63a0e0fa..b378768f23 100644
--- a/docs/tutorials/optimizers_cyclicallearningrate.ipynb
+++ b/docs/tutorials/optimizers_cyclicallearningrate.ipynb
@@ -1,452 +1,459 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Tce3stUlHN0L"
-      },
-      "source": [
-        "##### Copyright 2021 The TensorFlow Authors."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "tuOe1ymfHZPu"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "qFdPvlXBOdUN"
-      },
-      "source": [
-        "# TensorFlow Addons Optimizers: CyclicalLearningRate"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MfBg1C5NB3X0"
-      },
-      "source": [
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/optimizers_cyclicallearningrate\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/optimizers_cyclicallearningrate.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/optimizers_cyclicallearningrate.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "      <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/optimizers_cyclicallearningrate.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "xHxb-dlhMIzW"
-      },
-      "source": [
-        "## Overview\n",
-        "\n",
-        "This tutorial demonstrates the use of Cyclical Learning Rate from the Addons package."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "IqEImEhBJWFv"
-      },
-      "source": [
-        "## Cyclical Learning Rates\n",
-        "\n",
-        "It has been shown it is beneficial to adjust the learning rate as training progresses for a neural network. It has manifold benefits ranging from saddle point recovery to preventing numerical instabilities that may arise during backpropagation. But how does one know how much to adjust with respect to a particular training timestamp? In 2015, Leslie Smith noticed that you would want to increase the learning rate to traverse faster across the loss landscape but you would also want to reduce the learning rate when approaching convergence. To realize this idea, he proposed [Cyclical Learning Rates](https://arxiv.org/abs/1506.01186) (CLR) where you would adjust the learning rate with respect to the cycles of a function. For a visual demonstration, you can check out [this blog](https://www.jeremyjordan.me/nn-learning-rate/). CLR is now available as a TensorFlow API. For more details, check out the original paper [here](https://arxiv.org/abs/1506.01186). "
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MUXex9ctTuDB"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "t-p545dluzjI"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -q -U tensorflow_addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "RPF3aDZZu8le"
-      },
-      "outputs": [],
-      "source": [
-        "from tensorflow.keras import layers\n",
-        "import tensorflow_addons as tfa\n",
-        "import tensorflow as tf\n",
-        "\n",
-        "import numpy as np\n",
-        "import matplotlib.pyplot as plt\n",
-        "\n",
-        "tf.random.set_seed(42)\n",
-        "np.random.seed(42)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "XLOnLLrlR-ti"
-      },
-      "source": [
-        "## Load and prepare dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "uAHLo_Ffvie3"
-      },
-      "outputs": [],
-      "source": [
-        "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()\n",
-        "\n",
-        "x_train = np.expand_dims(x_train, -1)\n",
-        "x_test = np.expand_dims(x_test, -1)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "AfUS_s-uSBvx"
-      },
-      "source": [
-        "## Define hyperparameters"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "qumJ7KpwvvwE"
-      },
-      "outputs": [],
-      "source": [
-        "BATCH_SIZE = 64\n",
-        "EPOCHS = 10\n",
-        "INIT_LR = 1e-4\n",
-        "MAX_LR = 1e-2"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "G-x3E7RWSXWc"
-      },
-      "source": [
-        "## Define model building and model training utilities"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "vni6Gz3Dv9Db"
-      },
-      "outputs": [],
-      "source": [
-        "def get_training_model():\n",
-        "    model = tf.keras.Sequential(\n",
-        "        [\n",
-        "            layers.InputLayer((28, 28, 1)),\n",
-        "            layers.experimental.preprocessing.Rescaling(scale=1./255),\n",
-        "            layers.Conv2D(16, (5, 5), activation=\"relu\"),\n",
-        "            layers.MaxPooling2D(pool_size=(2, 2)),\n",
-        "            layers.Conv2D(32, (5, 5), activation=\"relu\"),\n",
-        "            layers.MaxPooling2D(pool_size=(2, 2)),\n",
-        "            layers.SpatialDropout2D(0.2),\n",
-        "            layers.GlobalAvgPool2D(),\n",
-        "            layers.Dense(128, activation=\"relu\"),\n",
-        "            layers.Dense(10, activation=\"softmax\"),\n",
-        "        ]\n",
-        "    )\n",
-        "    return model\n",
-        "\n",
-        "def train_model(model, optimizer):\n",
-        "    model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
-        "                       metrics=[\"accuracy\"])\n",
-        "    history = model.fit(x_train,\n",
-        "        y_train,\n",
-        "        batch_size=BATCH_SIZE,\n",
-        "        validation_data=(x_test, y_test),\n",
-        "        epochs=EPOCHS)\n",
-        "    return history"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "RlRKWRWrSk_t"
-      },
-      "source": [
-        "In the interest of reproducibility, the initial model weights are serialized which you will be using to conduct our experiments. "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "-JxnpsIzwCgj"
-      },
-      "outputs": [],
-      "source": [
-        "initial_model = get_training_model()\n",
-        "initial_model.save(\"initial_model\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "oNF33-tBSuFG"
-      },
-      "source": [
-        "## Train a model without CLR"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Q4dEJtQzwjei"
-      },
-      "outputs": [],
-      "source": [
-        "standard_model = tf.keras.models.load_model(\"initial_model\")\n",
-        "no_clr_history = train_model(standard_model, optimizer=\"sgd\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "eaK0PAN-Sy6l"
-      },
-      "source": [
-        "## Define CLR schedule\n",
-        "\n",
-        "The `tfa.optimizers.CyclicalLearningRate` module return a direct schedule that can be passed to an optimizer. The schedule takes a step as its input and outputs a value calculated using CLR formula as laid out in the paper. "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "ne0b8aGNyc3v"
-      },
-      "outputs": [],
-      "source": [
-        "steps_per_epoch = len(x_train) // BATCH_SIZE\n",
-        "clr = tfa.optimizers.CyclicalLearningRate(initial_learning_rate=INIT_LR,\n",
-        "    maximal_learning_rate=MAX_LR,\n",
-        "    scale_fn=lambda x: 1/(2.**(x-1)),\n",
-        "    step_size=2 * steps_per_epoch\n",
-        ")\n",
-        "optimizer = tf.keras.optimizers.SGD(clr)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "icVL3hsUTwXV"
-      },
-      "source": [
-        "Here, you specify the lower and upper bounds of the learning rate and the schedule will *oscillate* in between that range ([1e-4, 1e-2] in this case). `scale_fn` is used to define the function that would scale up and scale down the learning rate within a given cycle. `step_size` defines the duration of a single cycle. A `step_size` of 2 means you need a total of 4 iterations to complete one cycle. The recommended value for `step_size` is as follows:\n",
-        "\n",
-        "`factor * steps_per_epoch` where factor lies within the [2, 8] range. "
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "5JV_ESYqUb4d"
-      },
-      "source": [
-        "In the same [CLR paper](https://arxiv.org/abs/1506.01186), Leslie also presented a simple and elegant method to choose the bounds for learning rate. You are encouraged to check it out as well. [This blog post](https://www.pyimagesearch.com/2019/08/05/keras-learning-rate-finder/) provides a nice introduction to the method. \n",
-        "\n",
-        "Below, you visualize how the `clr` schedule looks like. "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "b_WRGfDx4Wwc"
-      },
-      "outputs": [],
-      "source": [
-        "step = np.arange(0, EPOCHS * steps_per_epoch)\n",
-        "lr = clr(step)\n",
-        "plt.plot(step, lr)\n",
-        "plt.xlabel(\"Steps\")\n",
-        "plt.ylabel(\"Learning Rate\")\n",
-        "plt.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "bBlKaAqNjHP1"
-      },
-      "source": [
-        "In order to better visualize the effect of CLR, you can plot the schedule with an increased number of steps. "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Gjhoyk-Li368"
-      },
-      "outputs": [],
-      "source": [
-        "step = np.arange(0, 100 * steps_per_epoch)\n",
-        "lr = clr(step)\n",
-        "plt.plot(step, lr)\n",
-        "plt.xlabel(\"Steps\")\n",
-        "plt.ylabel(\"Learning Rate\")\n",
-        "plt.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ObYcy5NRkF4V"
-      },
-      "source": [
-        "The function you are using in this tutorial is referred to as the `triangular2` method in the CLR paper. There are other two functions there were explored namely `triangular` and `exp` (short for exponential). "
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "-OV_8QVIe5m_"
-      },
-      "source": [
-        "## Train a model with CLR"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "zRSglElvy_fF"
-      },
-      "outputs": [],
-      "source": [
-        "clr_model = tf.keras.models.load_model(\"initial_model\")\n",
-        "clr_history = train_model(clr_model, optimizer=optimizer)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "8rhTLQJdnGfP"
-      },
-      "source": [
-        "As expected the loss starts higher than the usual and then it stabilizes as the cycles progress. You can confirm this visually with the plots below. "
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "LyHEgnv6e8lX"
-      },
-      "source": [
-        "## Visualize losses"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "wg0JjLwH2RTl"
-      },
-      "outputs": [],
-      "source": [
-        "(fig, ax) = plt.subplots(2, 1, figsize=(10, 8))\n",
-        "\n",
-        "ax[0].plot(no_clr_history.history[\"loss\"], label=\"train_loss\")\n",
-        "ax[0].plot(no_clr_history.history[\"val_loss\"], label=\"val_loss\")\n",
-        "ax[0].set_title(\"No CLR\")\n",
-        "ax[0].set_xlabel(\"Epochs\")\n",
-        "ax[0].set_ylabel(\"Loss\")\n",
-        "ax[0].set_ylim([0, 2.5])\n",
-        "ax[0].legend()\n",
-        "\n",
-        "ax[1].plot(clr_history.history[\"loss\"], label=\"train_loss\")\n",
-        "ax[1].plot(clr_history.history[\"val_loss\"], label=\"val_loss\")\n",
-        "ax[1].set_title(\"CLR\")\n",
-        "ax[1].set_xlabel(\"Epochs\")\n",
-        "ax[1].set_ylabel(\"Loss\")\n",
-        "ax[1].set_ylim([0, 2.5])\n",
-        "ax[1].legend()\n",
-        "\n",
-        "fig.tight_layout(pad=3.0)\n",
-        "fig.show()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "2EwZuz_pkqLM"
-      },
-      "source": [
-        "Even though for this toy example, you did not see the effects of CLR much but be noted that it is one of the main ingredients behind [Super Convergence](https://arxiv.org/abs/1708.07120) and can have a [really good impact](https://www.fast.ai/2018/08/10/fastai-diu-imagenet/) when training in large-scale settings. "
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "optimizers_cyclicallearningrate.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Tce3stUlHN0L"
+   },
+   "source": [
+    "##### Copyright 2021 The TensorFlow Authors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "tuOe1ymfHZPu"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "qFdPvlXBOdUN"
+   },
+   "source": [
+    "# TensorFlow Addons Optimizers: CyclicalLearningRate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MfBg1C5NB3X0"
+   },
+   "source": [
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/optimizers_cyclicallearningrate\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/optimizers_cyclicallearningrate.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/optimizers_cyclicallearningrate.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "      <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/optimizers_cyclicallearningrate.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xHxb-dlhMIzW"
+   },
+   "source": [
+    "## Overview\n",
+    "\n",
+    "This tutorial demonstrates the use of Cyclical Learning Rate from the Addons package."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "IqEImEhBJWFv"
+   },
+   "source": [
+    "## Cyclical Learning Rates\n",
+    "\n",
+    "It has been shown that it is beneficial to adjust the learning rate as training progresses for a neural network. Doing so has manifold benefits ranging from saddle point recovery to preventing numerical instabilities that may arise during backpropagation. But how does one know how much to adjust with respect to a particular training timestamp? In 2015, Leslie Smith noticed that you would want to increase the learning rate to traverse faster across the loss landscape but you would also want to reduce the learning rate when approaching convergence. To realize this idea, he proposed [Cyclical Learning Rates](https://arxiv.org/abs/1506.01186) (CLR) where you would adjust the learning rate with respect to the cycles of a function. For a visual demonstration, you can check out [this blog](https://www.jeremyjordan.me/nn-learning-rate/). CLR is now available as a TensorFlow API. For more details, check out the original paper [here](https://arxiv.org/abs/1506.01186). "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MUXex9ctTuDB"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "t-p545dluzjI"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -q -U tensorflow_addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "RPF3aDZZu8le"
+   },
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras import layers\n",
+    "import tensorflow_addons as tfa\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "tf.random.set_seed(42)\n",
+    "np.random.seed(42)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "XLOnLLrlR-ti"
+   },
+   "source": [
+    "## Load and prepare dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "uAHLo_Ffvie3"
+   },
+   "outputs": [],
+   "source": [
+    "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()\n",
+    "\n",
+    "x_train = np.expand_dims(x_train, -1)\n",
+    "x_test = np.expand_dims(x_test, -1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "AfUS_s-uSBvx"
+   },
+   "source": [
+    "## Define hyperparameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "qumJ7KpwvvwE"
+   },
+   "outputs": [],
+   "source": [
+    "BATCH_SIZE = 64\n",
+    "EPOCHS = 10\n",
+    "INIT_LR = 1e-4\n",
+    "MAX_LR = 1e-2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "G-x3E7RWSXWc"
+   },
+   "source": [
+    "## Define model building and model training utilities"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vni6Gz3Dv9Db"
+   },
+   "outputs": [],
+   "source": [
+    "def get_training_model():\n",
+    "    model = tf.keras.Sequential(\n",
+    "        [\n",
+    "            layers.InputLayer((28, 28, 1)),\n",
+    "            layers.experimental.preprocessing.Rescaling(scale=1.0 / 255),\n",
+    "            layers.Conv2D(16, (5, 5), activation=\"relu\"),\n",
+    "            layers.MaxPooling2D(pool_size=(2, 2)),\n",
+    "            layers.Conv2D(32, (5, 5), activation=\"relu\"),\n",
+    "            layers.MaxPooling2D(pool_size=(2, 2)),\n",
+    "            layers.SpatialDropout2D(0.2),\n",
+    "            layers.GlobalAvgPool2D(),\n",
+    "            layers.Dense(128, activation=\"relu\"),\n",
+    "            layers.Dense(10, activation=\"softmax\"),\n",
+    "        ]\n",
+    "    )\n",
+    "    return model\n",
+    "\n",
+    "\n",
+    "def train_model(model, optimizer):\n",
+    "    model.compile(\n",
+    "        loss=\"sparse_categorical_crossentropy\",\n",
+    "        optimizer=optimizer,\n",
+    "        metrics=[\"accuracy\"],\n",
+    "    )\n",
+    "    history = model.fit(\n",
+    "        x_train,\n",
+    "        y_train,\n",
+    "        batch_size=BATCH_SIZE,\n",
+    "        validation_data=(x_test, y_test),\n",
+    "        epochs=EPOCHS,\n",
+    "    )\n",
+    "    return history"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RlRKWRWrSk_t"
+   },
+   "source": [
+    "In the interest of reproducibility, the initial model is serialized so that every experiment below starts from the same initial weights. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "-JxnpsIzwCgj"
+   },
+   "outputs": [],
+   "source": [
+    "initial_model = get_training_model()\n",
+    "initial_model.save(\"initial_model\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oNF33-tBSuFG"
+   },
+   "source": [
+    "## Train a model without CLR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Q4dEJtQzwjei"
+   },
+   "outputs": [],
+   "source": [
+    "standard_model = tf.keras.models.load_model(\"initial_model\")\n",
+    "no_clr_history = train_model(standard_model, optimizer=\"sgd\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "eaK0PAN-Sy6l"
+   },
+   "source": [
+    "## Define CLR schedule\n",
+    "\n",
+    "The `tfa.optimizers.CyclicalLearningRate` class provides a schedule that can be passed directly to an optimizer. The schedule takes a step as its input and outputs a value calculated using the CLR formula as laid out in the paper. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ne0b8aGNyc3v"
+   },
+   "outputs": [],
+   "source": [
+    "steps_per_epoch = len(x_train) // BATCH_SIZE\n",
+    "clr = tfa.optimizers.CyclicalLearningRate(\n",
+    "    initial_learning_rate=INIT_LR,\n",
+    "    maximal_learning_rate=MAX_LR,\n",
+    "    scale_fn=lambda x: 1 / (2.0 ** (x - 1)),\n",
+    "    step_size=2 * steps_per_epoch,\n",
+    ")\n",
+    "optimizer = tf.keras.optimizers.SGD(clr)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "icVL3hsUTwXV"
+   },
+   "source": [
+    "Here, you specify the lower and upper bounds of the learning rate and the schedule will *oscillate* within that range ([1e-4, 1e-2] in this case). `scale_fn` is used to define the function that would scale up and scale down the learning rate within a given cycle. `step_size` defines the duration of half a cycle: a `step_size` of 2 means you need a total of 4 iterations to complete one cycle. The recommended value for `step_size` is as follows:\n",
+    "\n",
+    "`factor * steps_per_epoch` where factor lies within the [2, 8] range. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "5JV_ESYqUb4d"
+   },
+   "source": [
+    "In the same [CLR paper](https://arxiv.org/abs/1506.01186), Leslie also presented a simple and elegant method to choose the bounds for learning rate. You are encouraged to check it out as well. [This blog post](https://www.pyimagesearch.com/2019/08/05/keras-learning-rate-finder/) provides a nice introduction to the method. \n",
+    "\n",
+    "Below, you visualize what the `clr` schedule looks like. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "b_WRGfDx4Wwc"
+   },
+   "outputs": [],
+   "source": [
+    "step = np.arange(0, EPOCHS * steps_per_epoch)\n",
+    "lr = clr(step)\n",
+    "plt.plot(step, lr)\n",
+    "plt.xlabel(\"Steps\")\n",
+    "plt.ylabel(\"Learning Rate\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "bBlKaAqNjHP1"
+   },
+   "source": [
+    "In order to better visualize the effect of CLR, you can plot the schedule with an increased number of steps. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Gjhoyk-Li368"
+   },
+   "outputs": [],
+   "source": [
+    "step = np.arange(0, 100 * steps_per_epoch)\n",
+    "lr = clr(step)\n",
+    "plt.plot(step, lr)\n",
+    "plt.xlabel(\"Steps\")\n",
+    "plt.ylabel(\"Learning Rate\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ObYcy5NRkF4V"
+   },
+   "source": [
+    "The function you are using in this tutorial is referred to as the `triangular2` method in the CLR paper. Two other functions were explored there, namely `triangular` and `exp` (short for exponential). "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-OV_8QVIe5m_"
+   },
+   "source": [
+    "## Train a model with CLR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zRSglElvy_fF"
+   },
+   "outputs": [],
+   "source": [
+    "clr_model = tf.keras.models.load_model(\"initial_model\")\n",
+    "clr_history = train_model(clr_model, optimizer=optimizer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8rhTLQJdnGfP"
+   },
+   "source": [
+    "As expected, the loss starts higher than usual and then stabilizes as the cycles progress. You can confirm this visually with the plots below. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "LyHEgnv6e8lX"
+   },
+   "source": [
+    "## Visualize losses"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "wg0JjLwH2RTl"
+   },
+   "outputs": [],
+   "source": [
+    "(fig, ax) = plt.subplots(2, 1, figsize=(10, 8))\n",
+    "\n",
+    "ax[0].plot(no_clr_history.history[\"loss\"], label=\"train_loss\")\n",
+    "ax[0].plot(no_clr_history.history[\"val_loss\"], label=\"val_loss\")\n",
+    "ax[0].set_title(\"No CLR\")\n",
+    "ax[0].set_xlabel(\"Epochs\")\n",
+    "ax[0].set_ylabel(\"Loss\")\n",
+    "ax[0].set_ylim([0, 2.5])\n",
+    "ax[0].legend()\n",
+    "\n",
+    "ax[1].plot(clr_history.history[\"loss\"], label=\"train_loss\")\n",
+    "ax[1].plot(clr_history.history[\"val_loss\"], label=\"val_loss\")\n",
+    "ax[1].set_title(\"CLR\")\n",
+    "ax[1].set_xlabel(\"Epochs\")\n",
+    "ax[1].set_ylabel(\"Loss\")\n",
+    "ax[1].set_ylim([0, 2.5])\n",
+    "ax[1].legend()\n",
+    "\n",
+    "fig.tight_layout(pad=3.0)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2EwZuz_pkqLM"
+   },
+   "source": [
+    "Even though you did not see much of an effect from CLR in this toy example, note that it is one of the main ingredients behind [Super Convergence](https://arxiv.org/abs/1708.07120) and can have a [really good impact](https://www.fast.ai/2018/08/10/fastai-diu-imagenet/) when training in large-scale settings. "
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "optimizers_cyclicallearningrate.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/optimizers_lazyadam.ipynb b/docs/tutorials/optimizers_lazyadam.ipynb
index 90d0de2370..f50539535b 100644
--- a/docs/tutorials/optimizers_lazyadam.ipynb
+++ b/docs/tutorials/optimizers_lazyadam.ipynb
@@ -1,247 +1,248 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Tce3stUlHN0L"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "tuOe1ymfHZPu"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MfBg1C5NB3X0"
-      },
-      "source": [
-        "# TensorFlow Addons Optimizers: LazyAdam\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/optimizers_lazyadam\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/optimizers_lazyadam.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/optimizers_lazyadam.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/optimizers_lazyadam.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "xHxb-dlhMIzW"
-      },
-      "source": [
-        "## Overview\n",
-        "\n",
-        "This notebook will demonstrate how to use the lazy adam optimizer from the Addons package.\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "bQwBbFVAyHJ_"
-      },
-      "source": [
-        "## LazyAdam\n",
-        "\n",
-        "> LazyAdam is a variant of the Adam optimizer that handles sparse updates more efficiently.\n",
-        "    The original Adam algorithm maintains two moving-average accumulators for\n",
-        "    each trainable variable; the accumulators are updated at every step.\n",
-        "    This class provides lazier handling of gradient updates for sparse\n",
-        "    variables.  It only updates moving-average accumulators for sparse variable\n",
-        "    indices that appear in the current batch, rather than updating the\n",
-        "    accumulators for all indices. Compared with the original Adam optimizer,\n",
-        "    it can provide large improvements in model training throughput for some\n",
-        "    applications. However, it provides slightly different semantics than the\n",
-        "    original Adam algorithm, and may lead to different empirical results."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MUXex9ctTuDB"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "cHAOyeOVx-k3"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "42ztALK4ZdyZ"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "ys65MwOLKnXq"
-      },
-      "outputs": [],
-      "source": [
-        "# Hyperparameters\n",
-        "batch_size=64\n",
-        "epochs=10"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "KR01t9v_fxbT"
-      },
-      "source": [
-        "## Build the Model"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "djpoAvfWNyL5"
-      },
-      "outputs": [],
-      "source": [
-        "model = tf.keras.Sequential([\n",
-        "    tf.keras.layers.Dense(64, input_shape=(784,), activation='relu', name='dense_1'),\n",
-        "    tf.keras.layers.Dense(64, activation='relu', name='dense_2'),\n",
-        "    tf.keras.layers.Dense(10, activation='softmax', name='predictions'),\n",
-        "])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "0_D7CZqkv_Hj"
-      },
-      "source": [
-        "## Prepare the Data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "U0bS3SyowBoB"
-      },
-      "outputs": [],
-      "source": [
-        "# Load MNIST dataset as NumPy arrays\n",
-        "dataset = {}\n",
-        "num_validation = 10000\n",
-        "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
-        "\n",
-        "# Preprocess the data\n",
-        "x_train = x_train.reshape(-1, 784).astype('float32') / 255\n",
-        "x_test = x_test.reshape(-1, 784).astype('float32') / 255"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "HYE-BxhOzFQp"
-      },
-      "source": [
-        "## Train and Evaluate\n",
-        "\n",
-        "Simply replace typical keras optimizers with the new tfa optimizer "
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "NxfYhtiSzHf-"
-      },
-      "outputs": [],
-      "source": [
-        "# Compile the model\n",
-        "model.compile(\n",
-        "    optimizer=tfa.optimizers.LazyAdam(0.001),  # Utilize TFA optimizer\n",
-        "    loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
-        "    metrics=['accuracy'])\n",
-        "\n",
-        "# Train the network\n",
-        "history = model.fit(\n",
-        "    x_train,\n",
-        "    y_train,\n",
-        "    batch_size=batch_size,\n",
-        "    epochs=epochs)\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "1Y--0tK69SXf"
-      },
-      "outputs": [],
-      "source": [
-        "# Evaluate the network\n",
-        "print('Evaluate on test data:')\n",
-        "results = model.evaluate(x_test, y_test, batch_size=128, verbose = 2)\n",
-        "print('Test loss = {0}, Test acc: {1}'.format(results[0], results[1]))"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "optimizers_lazyadam.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Tce3stUlHN0L"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "tuOe1ymfHZPu"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MfBg1C5NB3X0"
+   },
+   "source": [
+    "# TensorFlow Addons Optimizers: LazyAdam\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/optimizers_lazyadam\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/optimizers_lazyadam.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/optimizers_lazyadam.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/optimizers_lazyadam.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xHxb-dlhMIzW"
+   },
+   "source": [
+    "## Overview\n",
+    "\n",
+    "This notebook will demonstrate how to use the LazyAdam optimizer from the Addons package.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "bQwBbFVAyHJ_"
+   },
+   "source": [
+    "## LazyAdam\n",
+    "\n",
+    "> LazyAdam is a variant of the Adam optimizer that handles sparse updates more efficiently.\n",
+    "    The original Adam algorithm maintains two moving-average accumulators for\n",
+    "    each trainable variable; the accumulators are updated at every step.\n",
+    "    This class provides lazier handling of gradient updates for sparse\n",
+    "    variables.  It only updates moving-average accumulators for sparse variable\n",
+    "    indices that appear in the current batch, rather than updating the\n",
+    "    accumulators for all indices. Compared with the original Adam optimizer,\n",
+    "    it can provide large improvements in model training throughput for some\n",
+    "    applications. However, it provides slightly different semantics than the\n",
+    "    original Adam algorithm, and may lead to different empirical results."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MUXex9ctTuDB"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "cHAOyeOVx-k3"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "42ztALK4ZdyZ"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ys65MwOLKnXq"
+   },
+   "outputs": [],
+   "source": [
+    "# Hyperparameters\n",
+    "batch_size = 64\n",
+    "epochs = 10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "KR01t9v_fxbT"
+   },
+   "source": [
+    "## Build the Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "djpoAvfWNyL5"
+   },
+   "outputs": [],
+   "source": [
+    "model = tf.keras.Sequential(\n",
+    "    [\n",
+    "        tf.keras.layers.Dense(\n",
+    "            64, input_shape=(784,), activation=\"relu\", name=\"dense_1\"\n",
+    "        ),\n",
+    "        tf.keras.layers.Dense(64, activation=\"relu\", name=\"dense_2\"),\n",
+    "        tf.keras.layers.Dense(10, activation=\"softmax\", name=\"predictions\"),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0_D7CZqkv_Hj"
+   },
+   "source": [
+    "## Prepare the Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "U0bS3SyowBoB"
+   },
+   "outputs": [],
+   "source": [
+    "# Load MNIST dataset as NumPy arrays\n",
+    "dataset = {}\n",
+    "num_validation = 10000\n",
+    "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
+    "\n",
+    "# Preprocess the data\n",
+    "x_train = x_train.reshape(-1, 784).astype(\"float32\") / 255\n",
+    "x_test = x_test.reshape(-1, 784).astype(\"float32\") / 255"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "HYE-BxhOzFQp"
+   },
+   "source": [
+    "## Train and Evaluate\n",
+    "\n",
+    "Simply replace the typical Keras optimizer with the new TFA optimizer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NxfYhtiSzHf-"
+   },
+   "outputs": [],
+   "source": [
+    "# Compile the model\n",
+    "model.compile(\n",
+    "    optimizer=tfa.optimizers.LazyAdam(0.001),  # Utilize TFA optimizer\n",
+    "    loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
+    "    metrics=[\"accuracy\"],\n",
+    ")\n",
+    "\n",
+    "# Train the network\n",
+    "history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "1Y--0tK69SXf"
+   },
+   "outputs": [],
+   "source": [
+    "# Evaluate the network\n",
+    "print(\"Evaluate on test data:\")\n",
+    "results = model.evaluate(x_test, y_test, batch_size=128, verbose=2)\n",
+    "print(\"Test loss = {0}, Test acc: {1}\".format(results[0], results[1]))"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "optimizers_lazyadam.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/time_stopping.ipynb b/docs/tutorials/time_stopping.ipynb
index bbcc83d673..7e74ecdee1 100644
--- a/docs/tutorials/time_stopping.ipynb
+++ b/docs/tutorials/time_stopping.ipynb
@@ -1,205 +1,208 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "mz0tl581YjZ0"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "hi0OrWAIYjZ4"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "gyGdPCvQYjaI"
-      },
-      "source": [
-        "# TensorFlow Addons Callbacks: TimeStopping"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "Z5csJXPVYjaM"
-      },
-      "source": [
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/time_stopping\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/time_stopping.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/time_stopping.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "      <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/time_stopping.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "BJhody3KYjaP"
-      },
-      "source": [
-        "## Overview\n",
-        "This notebook will demonstrate how to use TimeStopping Callback in TensorFlow Addons."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "SaZsCaGbYjaU"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "VgJGPL3ts_1i"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "fm_dHPvEYjar"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow_addons as tfa\n",
-        "\n",
-        "from tensorflow.keras.datasets import mnist\n",
-        "from tensorflow.keras.models import Sequential\n",
-        "from tensorflow.keras.layers import Dense, Dropout, Flatten"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "vg0y1DrQYja4"
-      },
-      "source": [
-        "## Import and Normalize Data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "HydkzZTuYja8"
-      },
-      "outputs": [],
-      "source": [
-        "# the data, split between train and test sets\n",
-        "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
-        "# normalize data\n",
-        "x_train, x_test = x_train / 255.0, x_test / 255.0"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "uX02I1kxYjbL"
-      },
-      "source": [
-        "## Build Simple MNIST CNN Model"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Tlk0MyEfYjbN"
-      },
-      "outputs": [],
-      "source": [
-        "# build the model using the Sequential API\n",
-        "model = Sequential()\n",
-        "model.add(Flatten(input_shape=(28, 28)))\n",
-        "model.add(Dense(128, activation='relu'))\n",
-        "model.add(Dropout(0.2))\n",
-        "model.add(Dense(10, activation='softmax'))\n",
-        "\n",
-        "model.compile(optimizer='adam',\n",
-        "              loss = 'sparse_categorical_crossentropy',\n",
-        "              metrics=['accuracy'])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "b5Xcyt0qYjbX"
-      },
-      "source": [
-        "## Simple TimeStopping Usage"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "W82_IZ6iYjbZ"
-      },
-      "outputs": [],
-      "source": [
-        "# initialize TimeStopping callback \n",
-        "time_stopping_callback = tfa.callbacks.TimeStopping(seconds=5, verbose=1)\n",
-        "\n",
-        "# train the model with tqdm_callback\n",
-        "# make sure to set verbose = 0 to disable\n",
-        "# the default progress bar.\n",
-        "model.fit(x_train, y_train,\n",
-        "          batch_size=64,\n",
-        "          epochs=100,\n",
-        "          callbacks=[time_stopping_callback],\n",
-        "          validation_data=(x_test, y_test))"
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "name": "time_stopping.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mz0tl581YjZ0"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors."
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "hi0OrWAIYjZ4"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gyGdPCvQYjaI"
+   },
+   "source": [
+    "# TensorFlow Addons Callbacks: TimeStopping"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Z5csJXPVYjaM"
+   },
+   "source": [
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/time_stopping\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/time_stopping.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/time_stopping.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "      <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/time_stopping.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "BJhody3KYjaP"
+   },
+   "source": [
+    "## Overview\n",
+    "This notebook will demonstrate how to use the TimeStopping callback in TensorFlow Addons."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "SaZsCaGbYjaU"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "VgJGPL3ts_1i"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "fm_dHPvEYjar"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow_addons as tfa\n",
+    "\n",
+    "from tensorflow.keras.datasets import mnist\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import Dense, Dropout, Flatten"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "vg0y1DrQYja4"
+   },
+   "source": [
+    "## Import and Normalize Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "HydkzZTuYja8"
+   },
+   "outputs": [],
+   "source": [
+    "# the data, split between train and test sets\n",
+    "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
+    "# normalize data\n",
+    "x_train, x_test = x_train / 255.0, x_test / 255.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uX02I1kxYjbL"
+   },
+   "source": [
+    "## Build Simple MNIST Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Tlk0MyEfYjbN"
+   },
+   "outputs": [],
+   "source": [
+    "# build the model using the Sequential API\n",
+    "model = Sequential()\n",
+    "model.add(Flatten(input_shape=(28, 28)))\n",
+    "model.add(Dense(128, activation=\"relu\"))\n",
+    "model.add(Dropout(0.2))\n",
+    "model.add(Dense(10, activation=\"softmax\"))\n",
+    "\n",
+    "model.compile(\n",
+    "    optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "b5Xcyt0qYjbX"
+   },
+   "source": [
+    "## Simple TimeStopping Usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "W82_IZ6iYjbZ"
+   },
+   "outputs": [],
+   "source": [
+    "# initialize TimeStopping callback\n",
+    "time_stopping_callback = tfa.callbacks.TimeStopping(seconds=5, verbose=1)\n",
+    "\n",
+    "# train the model with time_stopping_callback;\n",
+    "# training is halted once the 5-second limit has\n",
+    "# passed, rather than always running all 100 epochs.\n",
+    "model.fit(\n",
+    "    x_train,\n",
+    "    y_train,\n",
+    "    batch_size=64,\n",
+    "    epochs=100,\n",
+    "    callbacks=[time_stopping_callback],\n",
+    "    validation_data=(x_test, y_test),\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "name": "time_stopping.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/docs/tutorials/tqdm_progress_bar.ipynb b/docs/tutorials/tqdm_progress_bar.ipynb
index 7a3927d7ed..6b5490625d 100644
--- a/docs/tutorials/tqdm_progress_bar.ipynb
+++ b/docs/tutorials/tqdm_progress_bar.ipynb
@@ -1,263 +1,266 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "MyujzrAv2Vpk"
-      },
-      "source": [
-        "##### Copyright 2020 The TensorFlow Authors."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "cellView": "form",
-        "id": "rTUqXTqa2Vpm"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-        "# you may not use this file except in compliance with the License.\n",
-        "# You may obtain a copy of the License at\n",
-        "#\n",
-        "# https://www.apache.org/licenses/LICENSE-2.0\n",
-        "#\n",
-        "# Unless required by applicable law or agreed to in writing, software\n",
-        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-        "# See the License for the specific language governing permissions and\n",
-        "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "rNnfCHh82Vpq"
-      },
-      "source": [
-        "# TensorFlow Addons Callbacks: TQDM Progress Bar"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "4qrDJoTw2Vps"
-      },
-      "source": [
-        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/tqdm_progress_bar\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/tqdm_progress_bar.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
-        "  </td>\n",
-        "  <td>\n",
-        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/tqdm_progress_bar.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
-        "  </td>\n",
-        "      <td>\n",
-        "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/tqdm_progress_bar.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
-        "  </td>\n",
-        "</table>"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "bVS_PkvX2Vpt"
-      },
-      "source": [
-        "## Overview\n",
-        "This notebook will demonstrate how to use TQDMCallback in TensorFlow Addons."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "sRldODz32Vpu"
-      },
-      "source": [
-        "## Setup"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "H0yZwcvcR4Gc"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -U tensorflow-addons"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "etYr-Suo4KYj"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install -q \"tqdm>=4.36.1\"\n",
-        "\n",
-        "import tensorflow as tf\n",
-        "import tensorflow_addons as tfa\n",
-        "\n",
-        "from tensorflow.keras.datasets import mnist\n",
-        "from tensorflow.keras.models import Sequential\n",
-        "from tensorflow.keras.layers import Dense, Dropout, Flatten"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "SfXA0mI13pSE"
-      },
-      "outputs": [],
-      "source": [
-        "import tqdm\n",
-        "\n",
-        "# quietly deep-reload tqdm\n",
-        "import sys\n",
-        "from IPython.lib import deepreload \n",
-        "\n",
-        "stdout = sys.stdout\n",
-        "sys.stdout = open('junk','w')\n",
-        "deepreload.reload(tqdm)\n",
-        "sys.stdout = stdout\n",
-        "\n",
-        "tqdm.__version__"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "2RGuwIwe2Vp7"
-      },
-      "source": [
-        "## Import and Normalize Data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "qKfrsOSP2Vp8"
-      },
-      "outputs": [],
-      "source": [
-        "# the data, split between train and test sets\n",
-        "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
-        "# normalize data\n",
-        "x_train, x_test = x_train / 255.0, x_test / 255.0"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ORtL0s4X2VqB"
-      },
-      "source": [
-        "## Build Simple MNIST CNN Model"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "z8uAGGV32VqC"
-      },
-      "outputs": [],
-      "source": [
-        "# build the model using the Sequential API\n",
-        "model = Sequential()\n",
-        "model.add(Flatten(input_shape=(28, 28)))\n",
-        "model.add(Dense(128, activation='relu'))\n",
-        "model.add(Dropout(0.2))\n",
-        "model.add(Dense(10, activation='softmax'))\n",
-        "\n",
-        "model.compile(optimizer='adam',\n",
-        "              loss = 'sparse_categorical_crossentropy',\n",
-        "              metrics=['accuracy'])"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "YWOnH1ga2VqF"
-      },
-      "source": [
-        "## Default TQDMCallback Usage"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Vl_oj_OW2VqG"
-      },
-      "outputs": [],
-      "source": [
-        "# initialize tqdm callback with default parameters\n",
-        "tqdm_callback = tfa.callbacks.TQDMProgressBar()\n",
-        "\n",
-        "# train the model with tqdm_callback\n",
-        "# make sure to set verbose = 0 to disable\n",
-        "# the default progress bar.\n",
-        "model.fit(x_train, y_train,\n",
-        "          batch_size=64,\n",
-        "          epochs=10,\n",
-        "          verbose=0,\n",
-        "          callbacks=[tqdm_callback],\n",
-        "          validation_data=(x_test, y_test))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "uFvBfwJN2VqK"
-      },
-      "source": [
-        "**Below is the expected output when you run the cell above**\n",
-        "![TQDM Progress Bar Figure](https://raw.githubusercontent.com/tensorflow/addons/59961669a0e21eb4c045d4ad38d008a529d566c2/docs/tutorials/assets/tqdm_progress_bar_demo.png)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Np3dD8bhe10E"
-      },
-      "outputs": [],
-      "source": [
-        "# TQDMProgressBar() also works with evaluate()\n",
-        "model.evaluate(x_test, y_test, batch_size=64, callbacks=[tqdm_callback], verbose=0)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "36WRBMo7e10I"
-      },
-      "source": [
-        "**Below is the expected output when you run the cell above**\n",
-        "![TQDM Evaluate Progress Bar Figure](https://user-images.githubusercontent.com/20843596/79412655-a57cef00-7fc3-11ea-9bea-ade8ee8dad58.PNG)"
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "name": "tqdm_progress_bar.ipynb",
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MyujzrAv2Vpk"
+   },
+   "source": [
+    "##### Copyright 2020 The TensorFlow Authors."
+   ]
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "cellView": "form",
+    "id": "rTUqXTqa2Vpm"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rNnfCHh82Vpq"
+   },
+   "source": [
+    "# TensorFlow Addons Callbacks: TQDM Progress Bar"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "4qrDJoTw2Vps"
+   },
+   "source": [
+    "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://www.tensorflow.org/addons/tutorials/tqdm_progress_bar\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/tqdm_progress_bar.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/tensorflow/addons/blob/master/docs/tutorials/tqdm_progress_bar.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "      <td>\n",
+    "    <a href=\"https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/tqdm_progress_bar.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
+    "  </td>\n",
+    "</table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "bVS_PkvX2Vpt"
+   },
+   "source": [
+    "## Overview\n",
+    "This notebook demonstrates how to use the `TQDMProgressBar` callback in TensorFlow Addons."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "sRldODz32Vpu"
+   },
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "H0yZwcvcR4Gc"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -U tensorflow-addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "etYr-Suo4KYj"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -q \"tqdm>=4.36.1\"\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import tensorflow_addons as tfa\n",
+    "\n",
+    "from tensorflow.keras.datasets import mnist\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import Dense, Dropout, Flatten"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SfXA0mI13pSE"
+   },
+   "outputs": [],
+   "source": [
+    "import tqdm\n",
+    "\n",
+    "# quietly deep-reload tqdm\n",
+    "import sys\n",
+    "from IPython.lib import deepreload\n",
+    "\n",
+    "stdout = sys.stdout\n",
+    "sys.stdout = open(\"junk\", \"w\")\n",
+    "deepreload.reload(tqdm)\n",
+    "sys.stdout = stdout\n",
+    "\n",
+    "tqdm.__version__"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2RGuwIwe2Vp7"
+   },
+   "source": [
+    "## Import and Normalize Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "qKfrsOSP2Vp8"
+   },
+   "outputs": [],
+   "source": [
+    "# the data, split between train and test sets\n",
+    "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
+    "# normalize data\n",
+    "x_train, x_test = x_train / 255.0, x_test / 255.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ORtL0s4X2VqB"
+   },
+   "source": [
+    "## Build Simple MNIST Dense Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "z8uAGGV32VqC"
+   },
+   "outputs": [],
+   "source": [
+    "# build the model using the Sequential API\n",
+    "model = Sequential()\n",
+    "model.add(Flatten(input_shape=(28, 28)))\n",
+    "model.add(Dense(128, activation=\"relu\"))\n",
+    "model.add(Dropout(0.2))\n",
+    "model.add(Dense(10, activation=\"softmax\"))\n",
+    "\n",
+    "model.compile(\n",
+    "    optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "YWOnH1ga2VqF"
+   },
+   "source": [
+    "## Default TQDMProgressBar Usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Vl_oj_OW2VqG"
+   },
+   "outputs": [],
+   "source": [
+    "# initialize tqdm callback with default parameters\n",
+    "tqdm_callback = tfa.callbacks.TQDMProgressBar()\n",
+    "\n",
+    "# train the model with tqdm_callback\n",
+    "# make sure to set verbose = 0 to disable\n",
+    "# the default progress bar.\n",
+    "model.fit(\n",
+    "    x_train,\n",
+    "    y_train,\n",
+    "    batch_size=64,\n",
+    "    epochs=10,\n",
+    "    verbose=0,\n",
+    "    callbacks=[tqdm_callback],\n",
+    "    validation_data=(x_test, y_test),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uFvBfwJN2VqK"
+   },
+   "source": [
+    "**Below is the expected output when you run the cell above**\n",
+    "![TQDM Progress Bar Figure](https://raw.githubusercontent.com/tensorflow/addons/59961669a0e21eb4c045d4ad38d008a529d566c2/docs/tutorials/assets/tqdm_progress_bar_demo.png)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Np3dD8bhe10E"
+   },
+   "outputs": [],
+   "source": [
+    "# TQDMProgressBar() also works with evaluate()\n",
+    "model.evaluate(x_test, y_test, batch_size=64, callbacks=[tqdm_callback], verbose=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "36WRBMo7e10I"
+   },
+   "source": [
+    "**Below is the expected output when you run the cell above**\n",
+    "![TQDM Evaluate Progress Bar Figure](https://user-images.githubusercontent.com/20843596/79412655-a57cef00-7fc3-11ea-9bea-ade8ee8dad58.PNG)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "name": "tqdm_progress_bar.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }