feat: python notebooks template repo (#1195)

* adding python_notebooks template to synthtool * adding python notebook temlate repo * adding python notebooks template * updating .cloud-build to recent versions * addressing kokoro fail * streamlining purpose of PR to cover only our shared testing pipeline * updating check for Python version and adding a licence to cloud-build yaml * removing PR templates * updating nbQA version * upgrading nbqa and black, running black directly * revising cloud-build files to fit with kokoro * updating versions of cloud-build suite * addressed comments on cleanup setup * make requirements more compatible with renovatebot * pinning versions * used default artifacts pattern instead of manual copying of artifacts * reformatting commands * adding defaults * reverting to previous cloud-build.yaml file
googleapis · Oct 19, 2021 · 02cd0ee · 02cd0ee
1 parent d45942b
commit 02cd0ee
Show file tree

Hide file tree

Showing 16 changed files with 1,059 additions and 0 deletions.
diff --git a/synthtool/gcp/common.py b/synthtool/gcp/common.py
@@ -205,6 +205,12 @@ def py_samples_override(
         overridden_samples_kwargs["subdir"] = override_path
         return self._generic_library("python_samples", **overridden_samples_kwargs)
 
+    def python_notebooks(self, **kwargs) -> Path:
+        # kwargs["metadata"] is required to load values from .repo-metadata.json
+        if "metadata" not in kwargs:
+            kwargs["metadata"] = {}
+        return self._generic_library("python_notebooks", **kwargs)
+
     def py_library(self, **kwargs) -> Path:
         # kwargs["metadata"] is required to load values from .repo-metadata.json
         if "metadata" not in kwargs:

diff --git a/synthtool/gcp/templates/python_notebooks_testing_pipeline/.cloud-build/CheckPythonVersion.py b/synthtool/gcp/templates/python_notebooks_testing_pipeline/.cloud-build/CheckPythonVersion.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# # Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+
+MINIMUM_MAJOR_VERSION = 3
+MINIMUM_MINOR_VERSION = 5
+
+if (
+    sys.version_info.major >= MINIMUM_MAJOR_VERSION
+    or sys.version_info.minor >= MINIMUM_MINOR_VERSION
+):
+    print(f"Python version acceptable: {sys.version}")
+    exit(0)
+else:
+    print(
+        f"Error: Python version less than {MINIMUM_MAJOR_VERSION}.{MINIMUM_MINOR_VERSION}"
+    )
+    exit(1)
diff --git a/...l/gcp/templates/python_notebooks_testing_pipeline/.cloud-build/ExecuteChangedNotebooks.py b/...l/gcp/templates/python_notebooks_testing_pipeline/.cloud-build/ExecuteChangedNotebooks.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python
+# # Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import dataclasses
+import datetime
+import functools
+import pathlib
+import os
+import subprocess
+from pathlib import Path
+from typing import List, Optional
+import concurrent
+from tabulate import tabulate
+
+import ExecuteNotebook
+
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ("yes", "true", "t", "y", "1"):
+        return True
+    elif v.lower() in ("no", "false", "f", "n", "0"):
+        return False
+    else:
+        raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
+def format_timedelta(delta: datetime.timedelta) -> str:
+    """Formats a timedelta duration to [N days] %H:%M:%S format"""
+    seconds = int(delta.total_seconds())
+
+    secs_in_a_day = 86400
+    secs_in_a_hour = 3600
+    secs_in_a_min = 60
+
+    days, seconds = divmod(seconds, secs_in_a_day)
+    hours, seconds = divmod(seconds, secs_in_a_hour)
+    minutes, seconds = divmod(seconds, secs_in_a_min)
+
+    time_fmt = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+    if days > 0:
+        suffix = "s" if days > 1 else ""
+        return f"{days} day{suffix} {time_fmt}"
+
+    return time_fmt
+
+
+@dataclasses.dataclass
+class NotebookExecutionResult:
+    notebook: str
+    duration: datetime.timedelta
+    is_pass: bool
+    error_message: Optional[str]
+
+
+def execute_notebook(
+    artifacts_path: str,
+    variable_project_id: str,
+    variable_region: str,
+    should_log_output: bool,
+    should_use_new_kernel: bool,
+    notebook: str,
+) -> NotebookExecutionResult:
+    print(f"Running notebook: {notebook}")
+
+    result = NotebookExecutionResult(
+        notebook=notebook,
+        duration=datetime.timedelta(seconds=0),
+        is_pass=False,
+        error_message=None,
+    )
+
+    time_start = datetime.datetime.now()
+    try:
+        ExecuteNotebook.execute_notebook(
+            notebook_file_path=notebook,
+            output_file_folder=artifacts_path,
+            replacement_map={
+                "PROJECT_ID": variable_project_id,
+                "REGION": variable_region,
+            },
+            should_log_output=should_log_output,
+            should_use_new_kernel=should_use_new_kernel,
+        )
+        result.duration = datetime.datetime.now() - time_start
+        result.is_pass = True
+        print(f"{notebook} PASSED in {format_timedelta(result.duration)}.")
+    except Exception as error:
+        result.duration = datetime.datetime.now() - time_start
+        result.is_pass = False
+        result.error_message = str(error)
+        print(
+            f"{notebook} FAILED in {format_timedelta(result.duration)}: {result.error_message}"
+        )
+
+    return result
+
+
+def run_changed_notebooks(
+    test_paths_file: str,
+    base_branch: Optional[str],
+    output_folder: str,
+    variable_project_id: str,
+    variable_region: str,
+    should_parallelize: bool,
+    should_use_separate_kernels: bool,
+):
+    """
+    Run the notebooks that exist under the folders defined in the test_paths_file.
+    It only runs notebooks that have differences from the Git base_branch.
+    The executed notebooks are saved in the output_folder.
+    Variables are also injected into the notebooks such as the variable_project_id and variable_region.
+    Args:
+        test_paths_file (str):
+            Required. The new-line delimited file to folders and files that need checking.
+            Folders are checked recursively.
+        base_branch (str):
+            Optional. If provided, only the files that have changed from the base_branch will be checked.
+            If not provided, all files will be checked.
+        output_folder (str):
+            Required. The folder to write executed notebooks to.
+        variable_project_id (str):
+            Required. The value for PROJECT_ID to inject into notebooks.
+        variable_region (str):
+            Required. The value for REGION to inject into notebooks.
+        should_parallelize (bool):
+            Required. Should run notebooks in parallel using a thread pool as opposed to in sequence.
+        should_use_separate_kernels (bool):
+            Note: Dependencies don't install correctly when this is set to True
+            See https://github.com/nteract/papermill/issues/625
+            Required. Should run each notebook in a separate and independent virtual environment.
+    """
+
+    test_paths = []
+    with open(test_paths_file) as file:
+        lines = [line.strip() for line in file.readlines()]
+        lines = [line for line in lines if len(line) > 0]
+        test_paths = [line for line in lines]
+
+    if len(test_paths) == 0:
+        raise RuntimeError("No test folders found.")
+
+    print(f"Checking folders: {test_paths}")
+
+    # Find notebooks
+    notebooks = []
+    if base_branch:
+        print(f"Looking for notebooks that changed from branch: {base_branch}")
+        notebooks = subprocess.check_output(
+            ["git", "diff", "--name-only", f"origin/{base_branch}..."] + test_paths
+        )
+    else:
+        print("Looking for all notebooks.")
+        notebooks = subprocess.check_output(["git", "ls-files"] + test_paths)
+
+    notebooks = notebooks.decode("utf-8").split("\n")
+    notebooks = [notebook for notebook in notebooks if notebook.endswith(".ipynb")]
+    notebooks = [notebook for notebook in notebooks if len(notebook) > 0]
+    notebooks = [notebook for notebook in notebooks if Path(notebook).exists()]
+
+    # Create paths
+    artifacts_path = Path(output_folder)
+    artifacts_path.mkdir(parents=True, exist_ok=True)
+    artifacts_path.joinpath("success").mkdir(parents=True, exist_ok=True)
+    artifacts_path.joinpath("failure").mkdir(parents=True, exist_ok=True)
+
+    notebook_execution_results: List[NotebookExecutionResult] = []
+
+    if len(notebooks) > 0:
+        print(f"Found {len(notebooks)} modified notebooks: {notebooks}")
+
+        if should_parallelize and len(notebooks) > 1:
+            print(
+                "Running notebooks in parallel, so no logs will be displayed. Please wait..."
+            )
+            with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
+                notebook_execution_results = list(
+                    executor.map(
+                        functools.partial(
+                            execute_notebook,
+                            artifacts_path,
+                            variable_project_id,
+                            variable_region,
+                            False,
+                            should_use_separate_kernels,
+                        ),
+                        notebooks,
+                    )
+                )
+        else:
+            notebook_execution_results = [
+                execute_notebook(
+                    artifacts_path=artifacts_path,
+                    variable_project_id=variable_project_id,
+                    variable_region=variable_region,
+                    notebook=notebook,
+                    should_log_output=True,
+                    should_use_new_kernel=should_use_separate_kernels,
+                )
+                for notebook in notebooks
+            ]
+    else:
+        print("No notebooks modified in this pull request.")
+
+    print("\n=== RESULTS ===\n")
+
+    notebooks_sorted = sorted(
+        notebook_execution_results, key=lambda result: result.is_pass, reverse=True,
+    )
+    # Print results
+    print(
+        tabulate(
+            [
+                [
+                    os.path.basename(os.path.normpath(result.notebook)),
+                    "PASSED" if result.is_pass else "FAILED",
+                    format_timedelta(result.duration),
+                    result.error_message or "--",
+                ]
+                for result in notebooks_sorted
+            ],
+            headers=["file", "status", "duration", "error"],
+        )
+    )
+
+    print("\n=== END RESULTS===\n")
+
+
+parser = argparse.ArgumentParser(description="Run changed notebooks.")
+parser.add_argument(
+    "--test_paths_file",
+    type=pathlib.Path,
+    help="The path to the file that has newline-limited folders of notebooks that should be tested.",
+    required=True,
+)
+parser.add_argument(
+    "--base_branch",
+    help="The base git branch to diff against to find changed files.",
+    required=False,
+)
+parser.add_argument(
+    "--output_folder",
+    type=pathlib.Path,
+    help="The path to the folder to store executed notebooks.",
+    required=True,
+)
+parser.add_argument(
+    "--variable_project_id",
+    type=str,
+    help="The GCP project id. This is used to inject a variable value into the notebook before running.",
+    required=True,
+)
+parser.add_argument(
+    "--variable_region",
+    type=str,
+    help="The GCP region. This is used to inject a variable value into the notebook before running.",
+    required=True,
+)
+
+# Note: Dependencies don't install correctly when this is set to True
+parser.add_argument(
+    "--should_parallelize",
+    type=str2bool,
+    nargs="?",
+    const=True,
+    default=False,
+    help="Should run notebooks in parallel.",
+)
+
+# Note: This isn't guaranteed to work correctly due to existing Papermill issue
+# See https://github.com/nteract/papermill/issues/625
+parser.add_argument(
+    "--should_use_separate_kernels",
+    type=str2bool,
+    nargs="?",
+    const=True,
+    default=False,
+    help="(Experimental) Should run each notebook in a separate and independent virtual environment.",
+)
+
+args = parser.parse_args()
+run_changed_notebooks(
+    test_paths_file=args.test_paths_file,
+    base_branch=args.base_branch,
+    output_folder=args.output_folder,
+    variable_project_id=args.variable_project_id,
+    variable_region=args.variable_region,
+    should_parallelize=args.should_parallelize,
+    should_use_separate_kernels=args.should_use_separate_kernels,
+)