Skip to content

Commit

Permalink
notebook + readme (#3652)
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko committed May 7, 2024
1 parent 2d080ed commit 994a058
Show file tree
Hide file tree
Showing 2 changed files with 237 additions and 1 deletion.
6 changes: 5 additions & 1 deletion sql/util/README.md
Expand Up @@ -24,10 +24,14 @@ This query generates a list of candidate URLs for manifest and service worker fi

The `almanac.manifests` and `almanac.service_workers` tables depend on the `pwa_candidates` table. Running these queries will generate the latest data that can be appended to their respective tables.

## green_web_foundation
## [green_web_foundation.sql](./green_web_foundation.sql)

1. Go to https://admin.thegreenwebfoundation.org/admin/green-urls
2. Scroll to the bottom for the latest database dump
3. Convert to a BQ-compatible format, ie CSV
4. Import into a temporary BQ table
5. Join with the date-partitioned `green_web_foundation` table

## [bq_sql_to_spreadsheet.ipynb](./bq_to_sheets.ipynb)

This Jupyter notebook runs BigQuery SQL queries for a chapter and saves the results to a Google Sheet. It uses the `gspread` library to interact with Google Sheets.
232 changes: 232 additions & 0 deletions sql/util/bq_to_sheets.ipynb
@@ -0,0 +1,232 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/fellow-vicuna/sql/util/bq_to_sheets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"# Almanac\n",
"CHAPTER = \"privacy\"\n",
"YEAR = \"2024\"\n",
"\n",
"# BigQuery\n",
"GCP_PROJECT = \"httparchive\"\n",
"\n",
"# Git\n",
"BRANCH_NAME = \"{chapter}-sql-{year}\".format(\n",
" chapter=CHAPTER,\n",
" year=YEAR\n",
")\n",
"\n",
"# SQL folder\n",
"folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(\n",
" year=YEAR,\n",
" chapter=CHAPTER\n",
")\n",
"\n",
"# Google Sheets\n",
"spreadsheet_name = \"{chapter} (Web Almanac {year})\".format(\n",
" chapter=CHAPTER.capitalize(),\n",
" year=YEAR\n",
")\n",
"\n",
"# Set to `None` to create new one or an existing spreadsheet URL.\n",
"existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0/edit'"
],
"metadata": {
"id": "U37785Bxt5tE"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "OVkCxlRQH6Yt",
"outputId": "9fb31f97-8541-461a-991f-e7932da56101"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'almanac.httparchive.org'...\n",
"remote: Enumerating objects: 43942, done.\u001b[K\n",
"remote: Counting objects: 100% (5935/5935), done.\u001b[K\n",
"remote: Compressing objects: 100% (1535/1535), done.\u001b[K\n",
"remote: Total 43942 (delta 4709), reused 4950 (delta 4391), pack-reused 38007\u001b[K\n",
"Receiving objects: 100% (43942/43942), 384.14 MiB | 29.81 MiB/s, done.\n",
"Resolving deltas: 100% (29622/29622), done.\n",
"Updating files: 100% (5472/5472), done.\n"
]
}
],
"source": [
"# Download repo\n",
"!git clone -b $BRANCH_NAME https://github.com/HTTPArchive/almanac.httparchive.org.git"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UzhgG5xvbQ1E",
"outputId": "4dfc6202-2034-49bd-a77c-5a6e00e01bea"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Already on 'privacy-sql-2024'\n",
"Your branch is up to date with 'origin/privacy-sql-2024'.\n",
"Already up to date.\n"
]
}
],
"source": [
"# Update local branch\n",
"!cd almanac.httparchive.org/ && git checkout $BRANCH_NAME && git pull"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "45dBifFPJAtO"
},
"outputs": [],
"source": [
"# Authenticate\n",
"import google.auth\n",
"import os\n",
"from google.colab import auth\n",
"from google.cloud import bigquery\n",
"\n",
"import gspread\n",
"from gspread_dataframe import set_with_dataframe\n",
"\n",
"os.environ[\"GOOGLE_CLOUD_PROJECT\"] = GCP_PROJECT\n",
"auth.authenticate_user()\n",
"credentials, project = google.auth.default()\n",
"client = bigquery.Client()\n",
"gc = gspread.authorize(credentials)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nblNil985Tjt",
"outputId": "ccde5268-430c-4ecc-b99c-fce20d061ec8"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Using existing spreadsheet: https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0\n"
]
}
],
"source": [
"import glob\n",
"import re\n",
"\n",
"# Build Sheets\n",
"try:\n",
" ss = gc.open_by_url(existing_spreadsheet_url)\n",
" print('Using existing spreadsheet:', ss.url)\n",
"except:\n",
" ss = gc.create(spreadsheet_name)\n",
" print('Created a new spreadsheet:', spreadsheet_name, ss.url)\n",
"existing_sheets = [s.title for s in ss.worksheets()]\n",
"\n",
"file_match_include = r\"number_of_websites_with_features_based_on_string_search.sql\"+\"|\"+ \\\n",
" \"number_of_websites_with_origin_trial_from_token.sql\"\n",
"\n",
"file_match_exclude = r\"^$\"\n",
"\n",
"overwrite = False\n",
"dry_run = True\n",
"tb_processed_limit = 0.1\n",
"\n",
"# Find matching .sql queries in folder and save to google sheet.\n",
"for filepath in glob.iglob(folder):\n",
" filename = filepath.split('/')[-1]\n",
" sheet_title = re.sub(r\"(\\.sql|[^a-zA-Z0-9]+)\", \" \", filename).strip().title()\n",
"\n",
" if re.search(file_match_include, filename) and not re.search(file_match_exclude, filename):\n",
"\n",
" print('Processing:', sheet_title)\n",
" with open(filepath) as f:\n",
" query = f.read()\n",
"\n",
" response = client.query(\n",
" query,\n",
" job_config = bigquery.QueryJobConfig(dry_run = True)\n",
" )\n",
"\n",
" tb_processed = response.total_bytes_processed/1024/1024/1024/1024\n",
" print(f\"Total Tb billed:{tb_processed:9.3f}\")\n",
"\n",
" if dry_run:\n",
" continue\n",
"\n",
" if tb_processed > tb_processed_limit:\n",
" print('Data volume hit the limit. Skipping:', sheet_title)\n",
" continue\n",
"\n",
" if sheet_title in existing_sheets:\n",
" if not overwrite:\n",
" print('Overwrite is False. Skipping:', sheet_title)\n",
" continue\n",
"\n",
" else:\n",
" st = ss.worksheet(sheet_title)\n",
" ss.del_worksheet(st)\n",
"\n",
" df = client.query(query).to_dataframe()\n",
" rows, cols = df.shape\n",
"\n",
" st = ss.add_worksheet(title = sheet_title, rows = rows, cols = cols)\n",
" set_with_dataframe(st, df)\n",
"\n",
" else:\n",
" print('Not Matched. Skipping:', sheet_title)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

0 comments on commit 994a058

Please sign in to comment.