Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/split-indexing-into-multi-step-workflow #108

Merged
merged 7 commits into from Jun 15, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -1,9 +1,8 @@
name: Event Index

on:
push:
branches:
- main
workflow_dispatch:

schedule:
# <minute [0,59]> <hour [0,23]> <day of the month [1,31]> <month of the year [1,12]> <day of the week [0,6]>
# https://pubs.opengroup.org/onlinepubs/9699919799/utilities/crontab.html#tag_20_25_07
Expand All @@ -12,39 +11,128 @@ on:
# We offset from the hour and half hour to go easy on the servers :)
- cron: '26 3 * * 4'

# We doubly fan out
# We first generate indexs for uni, bi, and trigrams with a matrix
# Each index is split into chunks of 50,000 grams
# Then we fan out by every chunk and upload
{% raw %}
jobs:
index-events:
generate-index-chunks:
runs-on: ubuntu-latest
strategy:
matrix:
# We fan out on n-gram to make it possible to run on GitHub Actions
n-gram: [1, 2, 3]
fail-fast: false

outputs:
ngram-1-chunks: ${{ steps.output-index-chunks.outputs.ngram-1-chunks }}
ngram-2-chunks: ${{ steps.output-index-chunks.outputs.ngram-2-chunks }}
ngram-3-chunks: ${{ steps.output-index-chunks.outputs.ngram-3-chunks }}

steps:
# Setup Runner
- uses: actions/checkout@v2
- uses: actions/setup-python@v1
with:
python-version: 3.9

# Setup GCloud / Creds
- name: Setup gcloud
uses: google-github-actions/setup-gcloud@v0
with:
project_id: {{ cookiecutter.infrastructure_slug }}
service_account_key: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %}
project_id: {% endraw %}{{ cookiecutter.infrastructure_slug }}{% raw %}
service_account_key: ${{ secrets.GOOGLE_CREDENTIALS }}
export_default_credentials: true
- name: Dump Credentials to JSON
run: |
echo "$GOOGLE_CREDS" > python/google-creds.json
env:
GOOGLE_CREDS: ${{ secrets.GOOGLE_CREDENTIALS }}

# Installs
- name: Install Python Dependencies
run: |
cd python/
pip install .

# Index
- name: Index Events ${{ matrix.n-gram }}-grams
run: |
cd python/
run_cdp_event_index_generation event-index-config.json \
--n_grams ${{ matrix.n-gram }} \
--store_remote \
--parallel

# Store generated files to step output
- name: Store Index Fileset to Outputs
id: output-index-chunks
run: |
cd python/index/
output=$(python -c 'import os, json; print(json.dumps(os.listdir(".")))')
echo "::set-output name=ngram-${{ matrix.n-gram }}-chunks::$output"

combine-matrix-ngram-chunks:
needs: generate-index-chunks
runs-on: ubuntu-latest
outputs:
all-chunks: ${{ steps.combine-index-chunks.outputs.combined-chunks }}

steps:
# Setup Runner
- uses: actions/checkout@v2
- uses: actions/setup-python@v1
with:
python-version: 3.9

# Process
- name: Combine Chunks
id: 'combine-index-chunks'
run: |
echo 'print(${{ needs.generate-index-chunks.outputs.ngram-1-chunks }} + ${{ needs.generate-index-chunks.outputs.ngram-2-chunks }} + ${{ needs.generate-index-chunks.outputs.ngram-3-chunks }})' >> print-combined-chunks.py
output=$(python print-combined-chunks.py)
echo "::set-output name=combined-chunks::$output"

upload-index-chunks:
needs: combine-matrix-ngram-chunks
runs-on: ubuntu-latest
strategy:
max-parallel: 6
matrix:
filename: ${{ fromJson(needs.combine-matrix-ngram-chunks.outputs.all-chunks) }}
fail-fast: false

steps:
# Setup Runner
- uses: actions/checkout@v2
- uses: actions/setup-python@v1
with:
python-version: 3.9

# Setup GCloud / Creds
- name: Setup gcloud
uses: google-github-actions/setup-gcloud@v0
with:
project_id: {% endraw $}{{ cookiecutter.infrastructure_slug }}{$ raw $}
service_account_key: ${{ secrets.GOOGLE_CREDENTIALS }}
export_default_credentials: true
- name: Dump Credentials to JSON
run: |
echo "$GOOGLE_CREDS" > python/google-creds.json
env:
GOOGLE_CREDS: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %}
- name: Index Events {% raw %}${{ matrix.n-gram }}{% endraw %}-grams
GOOGLE_CREDS: ${{ secrets.GOOGLE_CREDENTIALS }}

# Installs
- name: Install Python Dependencies
run: |
cd python/
{% raw %}run_cdp_event_index event-index-config.json \
--n_grams ${{ matrix.n-gram }} \
--parallel{% endraw %}
pip install .

# Upload Index Chunk
- name: Process Upload
run: |
cd python/
process_cdp_event_index_chunk event-index-config.json \
${{ matrix.filename }} \
--parallel
{% endraw %}