notebook + readme (#3652)

HTTPArchive · May 7, 2024 · 994a058 · 994a058
1 parent 2d080ed
commit 994a058
Show file tree

Hide file tree

Showing 2 changed files with 237 additions and 1 deletion.
diff --git a/sql/util/README.md b/sql/util/README.md
@@ -24,10 +24,14 @@ This query generates a list of candidate URLs for manifest and service worker fi
 
 The `almanac.manifests` and `almanac.service_workers` tables depend on the `pwa_candidates` table. Running these queries will generate the latest data that can be appended to their respective tables.
 
-## green_web_foundation
+## [green_web_foundation.sql](./green_web_foundation.sql)
 
 1. Go to https://admin.thegreenwebfoundation.org/admin/green-urls
 2. Scroll to the bottom for the latest database dump
 3. Convert to a BQ-compatible format, e.g., CSV
 4. Import into a temporary BQ table
 5. Join with the date-partitioned `green_web_foundation` table
+
+## [bq_to_sheets.ipynb](./bq_to_sheets.ipynb)
+
+This Jupyter notebook runs BigQuery SQL queries for a chapter and saves the results to a Google Sheet. It uses the `gspread` library to interact with Google Sheets.
diff --git a/sql/util/bq_to_sheets.ipynb b/sql/util/bq_to_sheets.ipynb
@@ -0,0 +1,232 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/main/sql/util/bq_to_sheets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Almanac edition to process\n",
+        "CHAPTER = \"privacy\"\n",
+        "YEAR = \"2024\"\n",
+        "\n",
+        "# Google Cloud project (exported as GOOGLE_CLOUD_PROJECT by the auth cell)\n",
+        "GCP_PROJECT = \"httparchive\"\n",
+        "\n",
+        "# Git branch that is cloned and checked out below\n",
+        "BRANCH_NAME = \"{chapter}-sql-{year}\".format(chapter=CHAPTER, year=YEAR)\n",
+        "\n",
+        "# Glob pattern for the chapter's .sql files inside the cloned repository\n",
+        "folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(year=YEAR, chapter=CHAPTER)\n",
+        "\n",
+        "# Title given to the Google Sheets spreadsheet when a new one is created\n",
+        "spreadsheet_name = \"{chapter} (Web Almanac {year})\".format(chapter=CHAPTER.capitalize(), year=YEAR)\n",
+        "\n",
+        "# URL of an existing spreadsheet to reuse, or `None` to create a new one.\n",
+        "existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0/edit'"
+      ],
+      "metadata": {
+        "id": "U37785Bxt5tE"
+      },
+      "execution_count": 1,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "OVkCxlRQH6Yt",
+        "outputId": "9fb31f97-8541-461a-991f-e7932da56101"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Cloning into 'almanac.httparchive.org'...\n",
+            "remote: Enumerating objects: 43942, done.\u001b[K\n",
+            "remote: Counting objects: 100% (5935/5935), done.\u001b[K\n",
+            "remote: Compressing objects: 100% (1535/1535), done.\u001b[K\n",
+            "remote: Total 43942 (delta 4709), reused 4950 (delta 4391), pack-reused 38007\u001b[K\n",
+            "Receiving objects: 100% (43942/43942), 384.14 MiB | 29.81 MiB/s, done.\n",
+            "Resolving deltas: 100% (29622/29622), done.\n",
+            "Updating files: 100% (5472/5472), done.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Clone the chapter branch of the Web Almanac repository\n",
+        "!git clone --branch $BRANCH_NAME https://github.com/HTTPArchive/almanac.httparchive.org.git"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "UzhgG5xvbQ1E",
+        "outputId": "4dfc6202-2034-49bd-a77c-5a6e00e01bea"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Already on 'privacy-sql-2024'\n",
+            "Your branch is up to date with 'origin/privacy-sql-2024'.\n",
+            "Already up to date.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Check out the chapter branch in the existing clone and pull the latest commits\n",
+        "!git -C almanac.httparchive.org checkout $BRANCH_NAME && git -C almanac.httparchive.org pull"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "id": "45dBifFPJAtO"
+      },
+      "outputs": [],
+      "source": [
+        "# Authenticate\n",
+        "import os\n",
+        "\n",
+        "import google.auth\n",
+        "import gspread\n",
+        "from google.cloud import bigquery\n",
+        "from google.colab import auth\n",
+        "from gspread_dataframe import set_with_dataframe\n",
+        "\n",
+        "# Export the project through the environment before the BigQuery client is created.\n",
+        "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n",
+        "\n",
+        "# Interactive Colab sign-in, followed by loading the default credentials.\n",
+        "auth.authenticate_user()\n",
+        "credentials, project = google.auth.default()\n",
+        "\n",
+        "# `client` runs the BigQuery queries; `gc` is the gspread client used for Google Sheets.\n",
+        "client = bigquery.Client()\n",
+        "gc = gspread.authorize(credentials)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "nblNil985Tjt",
+        "outputId": "ccde5268-430c-4ecc-b99c-fce20d061ec8"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Using existing spreadsheet: https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0\n"
+          ]
+        }
+      ],
+      "source": [
+        "import glob\n",
+        "import os\n",
+        "import re\n",
+        "\n",
+        "# Regexes applied with `re.search` to each bare file name to select the queries to run.\n",
+        "file_match_include = (\n",
+        "    r\"number_of_websites_with_features_based_on_string_search.sql\"\n",
+        "    r\"|number_of_websites_with_origin_trial_from_token.sql\"\n",
+        ")\n",
+        "file_match_exclude = r\"^$\"\n",
+        "\n",
+        "overwrite = False  # Replace a worksheet that already exists for a query.\n",
+        "dry_run = True  # Only print the dry-run estimate; never run queries or write to the spreadsheet.\n",
+        "tb_processed_limit = 0.1  # Skip queries whose estimate (bytes / 1024**4) exceeds this value.\n",
+        "\n",
+        "# Build Sheets\n",
+        "# Open the configured spreadsheet, or create one when no URL is configured.\n",
+        "# An explicit branch (instead of a bare `except:`) lets a mistyped URL or a\n",
+        "# permission error surface rather than silently creating a new spreadsheet.\n",
+        "if existing_spreadsheet_url:\n",
+        "    ss = gc.open_by_url(existing_spreadsheet_url)\n",
+        "    print('Using existing spreadsheet:', ss.url)\n",
+        "else:\n",
+        "    ss = gc.create(spreadsheet_name)\n",
+        "    print('Created a new spreadsheet:', spreadsheet_name, ss.url)\n",
+        "existing_sheets = {s.title for s in ss.worksheets()}\n",
+        "\n",
+        "# Find matching .sql queries in folder and save each result to its own worksheet.\n",
+        "sql_files = sorted(glob.glob(folder))\n",
+        "if not sql_files:\n",
+        "    print('No .sql files found in:', folder)\n",
+        "\n",
+        "for filepath in sql_files:\n",
+        "    filename = os.path.basename(filepath)\n",
+        "    sheet_title = re.sub(r\"(\\.sql|[^a-zA-Z0-9]+)\", \" \", filename).strip().title()\n",
+        "\n",
+        "    if not re.search(file_match_include, filename) or re.search(file_match_exclude, filename):\n",
+        "        print('Not Matched. Skipping:', sheet_title)\n",
+        "        continue\n",
+        "\n",
+        "    print('Processing:', sheet_title)\n",
+        "    with open(filepath, encoding='utf-8') as f:\n",
+        "        query = f.read()\n",
+        "\n",
+        "    # Dry run: estimates the bytes scanned without executing the query.\n",
+        "    # The query cache is disabled so a cached result cannot make the estimate read 0.\n",
+        "    response = client.query(\n",
+        "        query,\n",
+        "        job_config=bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)\n",
+        "    )\n",
+        "\n",
+        "    tb_processed = response.total_bytes_processed / 1024**4\n",
+        "    print(f\"Estimated TiB processed:{tb_processed:9.3f}\")\n",
+        "\n",
+        "    if dry_run:\n",
+        "        continue\n",
+        "\n",
+        "    if tb_processed > tb_processed_limit:\n",
+        "        print('Data volume hit the limit. Skipping:', sheet_title)\n",
+        "        continue\n",
+        "\n",
+        "    if sheet_title in existing_sheets and not overwrite:\n",
+        "        print('Overwrite is False. Skipping:', sheet_title)\n",
+        "        continue\n",
+        "\n",
+        "    # Run the query before deleting anything, so a failed query leaves the old worksheet intact.\n",
+        "    df = client.query(query).to_dataframe()\n",
+        "    rows, cols = df.shape\n",
+        "\n",
+        "    if sheet_title in existing_sheets:\n",
+        "        ss.del_worksheet(ss.worksheet(sheet_title))\n",
+        "\n",
+        "    # One extra row for the header; at least one column so an empty result is still a valid grid.\n",
+        "    st = ss.add_worksheet(title=sheet_title, rows=rows + 1, cols=max(cols, 1))\n",
+        "    set_with_dataframe(st, df)\n",
+        "    existing_sheets.add(sheet_title)"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}