diff --git a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml index c114314b..8f541c20 100644 --- a/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml +++ b/{{ cookiecutter.hosting_github_repo_name }}/.github/workflows/event-index-pipeline.yml @@ -1,9 +1,8 @@ name: Event Index on: - push: - branches: - - main + workflow_dispatch: + schedule: # # https://pubs.opengroup.org/onlinepubs/9699919799/utilities/crontab.html#tag_20_25_07 @@ -12,39 +11,128 @@ on: # We offset from the hour and half hour to go easy on the servers :) - cron: '26 3 * * 4' +# We doubly fan out +# We first generate indexs for uni, bi, and trigrams with a matrix +# Each index is split into chunks of 50,000 grams +# Then we fan out by every chunk and upload +{% raw %} jobs: - index-events: + generate-index-chunks: runs-on: ubuntu-latest strategy: matrix: - # We fan out on n-gram to make it possible to run on GitHub Actions n-gram: [1, 2, 3] + fail-fast: false + + outputs: + ngram-1-chunks: ${{ steps.output-index-chunks.outputs.ngram-1-chunks }} + ngram-2-chunks: ${{ steps.output-index-chunks.outputs.ngram-2-chunks }} + ngram-3-chunks: ${{ steps.output-index-chunks.outputs.ngram-3-chunks }} steps: + # Setup Runner - uses: actions/checkout@v2 - uses: actions/setup-python@v1 with: python-version: 3.9 + # Setup GCloud / Creds - name: Setup gcloud uses: google-github-actions/setup-gcloud@v0 with: - project_id: {{ cookiecutter.infrastructure_slug }} - service_account_key: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} + project_id: {% endraw %}{{ cookiecutter.infrastructure_slug }}{% raw %} + service_account_key: ${{ secrets.GOOGLE_CREDENTIALS }} export_default_credentials: true + - name: Dump Credentials to JSON + run: | + echo "$GOOGLE_CREDS" > python/google-creds.json + env: + GOOGLE_CREDS: ${{ secrets.GOOGLE_CREDENTIALS }} + # Installs - name: Install Python Dependencies run: | cd python/ pip install . + + # Index + - name: Index Events ${{ matrix.n-gram }}-grams + run: | + cd python/ + run_cdp_event_index_generation event-index-config.json \ + --n_grams ${{ matrix.n-gram }} \ + --store_remote \ + --parallel + + # Store generated files to step output + - name: Store Index Fileset to Outputs + id: output-index-chunks + run: | + cd python/index/ + output=$(python -c 'import os, json; print(json.dumps(os.listdir(".")))') + echo "::set-output name=ngram-${{ matrix.n-gram }}-chunks::$output" + + combine-matrix-ngram-chunks: + needs: generate-index-chunks + runs-on: ubuntu-latest + outputs: + all-chunks: ${{ steps.combine-index-chunks.outputs.combined-chunks }} + + steps: + # Setup Runner + - uses: actions/checkout@v2 + - uses: actions/setup-python@v1 + with: + python-version: 3.9 + + # Process + - name: Combine Chunks + id: 'combine-index-chunks' + run: | + echo 'print(${{ needs.generate-index-chunks.outputs.ngram-1-chunks }} + ${{ needs.generate-index-chunks.outputs.ngram-2-chunks }} + ${{ needs.generate-index-chunks.outputs.ngram-3-chunks }})' >> print-combined-chunks.py + output=$(python print-combined-chunks.py) + echo "::set-output name=combined-chunks::$output" + + upload-index-chunks: + needs: combine-matrix-ngram-chunks + runs-on: ubuntu-latest + strategy: + max-parallel: 6 + matrix: + filename: ${{ fromJson(needs.combine-matrix-ngram-chunks.outputs.all-chunks) }} + fail-fast: false + + steps: + # Setup Runner + - uses: actions/checkout@v2 + - uses: actions/setup-python@v1 + with: + python-version: 3.9 + + # Setup GCloud / Creds + - name: Setup gcloud + uses: google-github-actions/setup-gcloud@v0 + with: + project_id: {% endraw $}{{ cookiecutter.infrastructure_slug }}{$ raw $} + service_account_key: ${{ secrets.GOOGLE_CREDENTIALS }} + export_default_credentials: true - name: Dump Credentials to JSON run: | echo "$GOOGLE_CREDS" > python/google-creds.json env: - GOOGLE_CREDS: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %} - - name: Index Events {% raw %}${{ matrix.n-gram }}{% endraw %}-grams + GOOGLE_CREDS: ${{ secrets.GOOGLE_CREDENTIALS }} + + # Installs + - name: Install Python Dependencies run: | cd python/ - {% raw %}run_cdp_event_index event-index-config.json \ - --n_grams ${{ matrix.n-gram }} \ - --parallel{% endraw %} + pip install . + + # Upload Index Chunk + - name: Process Upload + run: | + cd python/ + process_cdp_event_index_chunk event-index-config.json \ + ${{ matrix.filename }} \ + --parallel +{% endraw %} \ No newline at end of file